Changing from js to golang
This commit is contained in:
122
pyth/checking_similar.py
Normal file
122
pyth/checking_similar.py
Normal file
@@ -0,0 +1,122 @@
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from openai import OpenAI
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from db_management import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity, get_titles_links_embeddings
|
||||
from get_articles import slice_text_at_2k_tokens
|
||||
import json
|
||||
from json_repair import repair_json
|
||||
from publishing_finals import publish_articles
|
||||
|
||||
# Run dotenv loading first; everything below is created after it.
load_dotenv()

# Kept as a module constant. It is not passed explicitly to the two
# constructors below.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Module-wide handles used by processing_similar(): `client` for the chat
# completion call, `embeddings` for embed_query().
client = OpenAI()
embeddings = OpenAIEmbeddings()

# Import-time banner (printed whenever this module is loaded).
print("Checking for similar!")
|
||||
|
||||
|
||||
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Group stored articles whose embeddings are close to each other.

    Titles, links and embeddings come from ``get_titles_links_embeddings()``.
    Articles are visited in order; each not-yet-grouped article seeds a new
    group and pulls in every other not-yet-grouped article whose cosine
    similarity to the seed is strictly greater than ``threshold`` and whose
    link differs from the seed's link.

    Args:
        eps: Accepted for compatibility; not used by this function.
        min_samples: Accepted for compatibility; not used by this function.
        threshold: Similarity a pair must exceed to be grouped together.

    Returns:
        A list of groups, one per seed article (so single-article groups are
        included). The first entry of a group is the seed as
        ``(title, link)``; every further entry is
        ``(title, link, embedding)``. An empty list is returned after a
        ``psycopg2.Error`` has been printed.
    """
    try:
        titles, links, vectors = get_titles_links_embeddings()

        seen = set()
        groups = []

        for seed_idx, (seed_title, seed_link, seed_vec) in enumerate(
            zip(titles, links, vectors)
        ):
            seed_key = (seed_title, seed_link)
            if seed_key in seen:
                # Already a seed or a member of an earlier group.
                continue

            seen.add(seed_key)
            members = [seed_key]

            for other_idx, (title, link, vec) in enumerate(
                zip(titles, links, vectors)
            ):
                if other_idx == seed_idx or (title, link) in seen:
                    continue

                # The link comparison runs only after the similarity test
                # passes, exactly like the nested checks it replaces.
                if (
                    calculate_cosine_similarity(seed_vec, vec) > threshold
                    and seed_link != link
                ):
                    seen.add((title, link))
                    members.append((title, link, vec))

            groups.append(members)

        return groups

    except psycopg2.Error as err:
        print(f"Error: {err}")
        return []
|
||||
|
||||
|
||||
def processing_articles(articles):
    """Flag every article of a group as combined and build the merged input.

    For each article, the group label ``"C: <title>, <title>, ..."`` is
    written with ``modify_similar_data`` and ``preparing_articles(False,
    title)`` is called. Afterwards the stored texts of all articles (field
    ``[0][1]`` of ``get_specific_data(title)``) are joined with single spaces
    and passed through ``slice_text_at_2k_tokens``.

    Args:
        articles: Sequence of tuples whose first two items are
            ``(title, link)``; any further items are ignored.

    Returns:
        A ``(combined_text, link)`` tuple. ``link`` is the single distinct
        link of the group, or all distinct links joined with ``", "`` in the
        order they first appear in ``articles``.

    Raises:
        IndexError: If ``get_specific_data`` returns no row for a title.
    """
    # The label is the same for every member, so build it once.
    group_label = f"C: {', '.join(art[0] for art in articles)}"

    # A dict keeps insertion order, which makes the joined link string stable
    # between runs; joining a set of strings gave an arbitrary order.
    unique_links = {}

    for a_title, a_link in (article[:2] for article in articles):
        # NOTE(review): the value read here is unused. The lookup is kept so
        # that a missing row still raises before any update is written for
        # this article - drop it if that ordering is not required.
        _ = get_specific_data(a_title)[0][1]

        modify_similar_data(group_label, a_title)
        preparing_articles(False, a_title)
        unique_links.setdefault(a_link, None)

    combined_text = ' '.join(
        get_specific_data(art[0])[0][1] for art in articles
    )
    combined_text = slice_text_at_2k_tokens(combined_text)

    if len(unique_links) == 1:
        # Return the lone link object unchanged (no string join involved).
        link = next(iter(unique_links))
    else:
        link = ', '.join(unique_links)
    return combined_text, link
|
||||
|
||||
|
||||
def processing_similar():
    """Merge each group of similar articles into one combined article.

    Groups come from ``find_and_group_similar_articles()``. For every group
    with more than one article, the texts are combined by
    ``processing_articles``, sent to the chat model with a merge prompt, and
    the model's JSON answer (after ``repair_json``) is stored through
    ``insert_data`` under the title and category of the group's first
    article. Any exception raised while handling the model answer or the
    insert is printed together with the group's first title, and the loop
    moves on to the next group.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    groups = find_and_group_similar_articles()

    if not groups:
        print("No similar articles found.")
        return

    for articles in groups:
        if len(articles) <= 1:
            # NOTE(review): this message was reconstructed as the `else` of
            # the group-size check from a whitespace-stripped source -
            # confirm it is not meant to run once after the loop instead.
            print("Done!.")
            continue

        combined_text, link = processing_articles(articles)
        user_message = (
            f"Here are {len(articles)} texts {combined_text}, combine the following texts into a cohesive news, "
            "remove any non-news related to all texts, and provide the cleaned data on Bosnian languageas and return as JSON only with a single 'content' field."
        )
        chat_messages = [
            {"role": "system", "content": "Data analytic, Journalist and News reporter"},
            {"role": "user", "content": user_message},
        ]

        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=chat_messages,
            )
            raw_answer = completion.choices[0].message.content
            generated_text = repair_json(raw_answer)

            response_data = json.loads(generated_text)
            title = articles[0][0]
            text = response_data["content"]

            # NOTE(review): the vector is computed from the repaired JSON
            # string, not from the extracted `text` - confirm this is
            # intended.
            vector = embeddings.embed_query(generated_text)

            # The combined article takes field 5 of the first article's row
            # as its category.
            category = get_specific_data(title)[0][5]

            group_label = f"C: {', '.join(art[0] for art in articles)}"
            insert_data(title, text, link, vector, group_label, category)
            print(f"Inserting combined: {title} and Category: {category}")

        except Exception as err:
            # Per-group boundary: report and carry on with the next group.
            print(f"Error: {err}")
            print(articles[0][0])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: combine similar articles first, then publish.
    for step in (processing_similar, publish_articles):
        step()
|
||||
Reference in New Issue
Block a user