added article.py

This commit is contained in:
2024-01-06 08:17:05 +01:00
parent ae1c1902da
commit d4e99c7c5f
8 changed files with 329 additions and 18 deletions

View File

@@ -4,7 +4,7 @@ from urllib.parse import urljoin
from openai import OpenAI , APIError
import os
from langchain.embeddings import OpenAIEmbeddings
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data)
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data)
import json
from dotenv import load_dotenv
import tiktoken
@@ -48,6 +48,19 @@ def replace_with_spaces(text):
cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text)
return cleaned_text
def fix_links(links_set):
    """Return a new set of links with a leading ``www.`` host prefix removed.

    Only a ``www.`` that starts the host is stripped, i.e. one that appears
    at the very beginning of the link or directly after an optional
    ``scheme://`` (or protocol-relative ``//``) prefix. Occurrences of
    ``www.`` elsewhere in the link (path, query string, fragment) are left
    untouched, so a link such as ``https://site.com/awww.html`` is not
    corrupted.

    Args:
        links_set: Iterable of link strings (typically a set).

    Returns:
        A new set containing the normalized links. Links that become equal
        after normalization collapse into one entry. The input is not
        modified.
    """
    # Local import so the function does not depend on the module's import block.
    import re

    # Group 1 captures the optional "scheme://" or "//" prefix so it can be
    # kept while the "www." that follows it is dropped.
    www_host_prefix = re.compile(r"^((?:[A-Za-z][A-Za-z0-9+.-]*:)?//)?www\.")

    def strip_www(link):
        # group(1) is None when the link has no scheme prefix.
        return www_host_prefix.sub(lambda m: m.group(1) or "", link)

    return {strip_www(link) for link in links_set}
total_links = set()
collected_news = set()
@@ -78,13 +91,13 @@ for dlink in dlinks:
total_links.update(temp_links)
final_links = {item for item in total_links if item}
i = 0
db_links = set(get_all_links())
new_links = final_links - db_links
final_links = new_links
final_links = set(final_links)
final_links = fix_links(final_links)
if __name__ == '__main__':
@@ -142,6 +155,7 @@ if __name__ == '__main__':
print(f"Error in completion: {e}")
continue
def comb_similar():
print("Checking similar")
@@ -185,12 +199,17 @@ def comb_similar():
combined_text = f"{text1}{text2}{text3}"
combined_text = slice_text_at_2k_tokens(combined_text)
user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field"
link = f"{link1} {link2} {link3}"
if link1 != link2 and link1 != link3 and link2 != link3:
link = f"{link1} {link2} {link3}"
else:
link = link1
else:
user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
link = f"{link1} {link2} {link3}"
if link1 != link2 and link1 != link3 and link2 != link3:
link = f"{link1} {link2} {link3}"
else:
link = link1
else:
ftcheck = num_tokens_from_string(f_text)
stcheck = num_tokens_from_string(s_text)
@@ -198,12 +217,17 @@ def comb_similar():
if fscomb <2000:
combined_text = f"{f_text}{s_text}"
user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field"
link = f"{link_f} {link_s}"
if link_f != link_s:
link = f"{link_f} {link_s}"
else:
link = link_f
else:
user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
link = f"{link_f} {link_s}"
if link_f != link_s:
link = f"{link_f} {link_s}"
else:
link = link_f
try:
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
@@ -213,7 +237,6 @@ def comb_similar():
]
)
generated_text = completion.choices[0].message.content
generated_text = generated_text
if similar_article:
if f_title == s_title:
@@ -222,6 +245,7 @@ def comb_similar():
similar_article.remove(sa)
print("Modified")
else:
print(f"First: {f_title}")
print(f"Second: {s_title}")
modify_similar_data(first_t,"SOURCE")
modify_similar_data(second_t,"SOURCE")
@@ -243,5 +267,3 @@ def comb_similar():
except Exception as e:
print(f"Error in completion: {e}")
continue
comb_similar()