"""Detect near-duplicate stored articles and merge each group into one article.

Pipeline (run as a script):
1. Load every stored title/link/embedding and group articles whose embedding
   cosine similarity exceeds a threshold.
2. For each group with more than one member, flag the source articles, ask
   the chat model to merge their texts, and insert the merged article.
3. Publish the final articles.
"""
import json
import os

import psycopg2
from dotenv import load_dotenv
from json_repair import repair_json
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI

from db_management import (
    calculate_cosine_similarity,
    get_specific_data,
    get_titles_links_embeddings,
    insert_data,
    modify_similar_data,
    preparing_articles,
)
from get_articles import slice_text_at_2k_tokens
from publishing_finals import publish_articles

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()

print("Checking for similar!")


def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Group stored articles whose embeddings are nearly identical.

    Every article not yet assigned to a group starts a new group; each later
    unassigned article joins it when its cosine similarity to the group's
    first article is above ``threshold`` and its link differs.

    Args:
        eps: Unused; kept so existing callers keep working.
        min_samples: Unused; kept so existing callers keep working.
        threshold: Similarity strictly above this value counts as "similar".

    Returns:
        A list of groups. The first member of a group is a ``(title, link)``
        tuple; every further member is ``(title, link, embedding)``.
        Articles without a match still appear as single-member groups.
        An empty list is returned when a ``psycopg2.Error`` occurs.
    """
    try:
        # Named ``vectors`` so the module-level ``embeddings`` client is not
        # shadowed inside this function.
        titles, links, vectors = get_titles_links_embeddings()
        records = list(zip(titles, links, vectors))

        assigned = set()
        groups = []
        for i, (title1, link1, embedding1) in enumerate(records):
            seed = (title1, link1)
            if seed in assigned:
                continue
            assigned.add(seed)
            group = [seed]

            # Everything before index i is already in ``assigned`` (each
            # earlier index was either a seed or joined a group), so only
            # the later records can still match.
            for title2, link2, embedding2 in records[i + 1:]:
                if (title2, link2) in assigned:
                    continue
                similarity = calculate_cosine_similarity(embedding1, embedding2)
                if similarity > threshold and link1 != link2:
                    assigned.add((title2, link2))
                    group.append((title2, link2, embedding2))

            groups.append(group)
        return groups
    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []


def processing_articles(articles):
    """Flag a group of similar articles and build their combined text.

    For every article in the group the "C: <titles>" label is stored via
    ``modify_similar_data`` and ``preparing_articles(False, title)`` is
    called. The texts of all articles are then joined with spaces and
    truncated with ``slice_text_at_2k_tokens``.

    Args:
        articles: Sequence of tuples whose first two items are the article
            title and link.

    Returns:
        ``(combined_text, link)``. ``link`` is the single distinct link, or
        all distinct links joined with ", " in first-seen order.
    """
    # The label is identical for every member, so build it once.
    group_label = f"C: {', '.join(art[0] for art in articles)}"

    # A dict keeps first-seen order, so the joined link string is stable
    # between runs (joining a set is not).
    distinct_links = {}
    for article in articles:
        title, link = article[:2]
        # Touch the stored row first so a missing article raises before any
        # row is modified.
        _ = get_specific_data(title)[0][1]
        modify_similar_data(group_label, title)
        preparing_articles(False, title)
        distinct_links[link] = None

    # NOTE(review): texts are re-read after the updates above, as before;
    # confirm the helpers never alter the text before merging the two reads.
    combined_text = ' '.join(get_specific_data(art[0])[0][1] for art in articles)
    combined_text = slice_text_at_2k_tokens(combined_text)

    if len(distinct_links) == 1:
        link = next(iter(distinct_links))
    else:
        link = ', '.join(distinct_links)
    return combined_text, link


def processing_similar():
    """Merge every group of similar articles into one stored article.

    Each group with more than one member is combined by the chat model into
    a single JSON ``content`` field, embedded, and inserted under the title
    and category of the group's first article. A failure while generating or
    inserting one group is printed and the next group is processed.
    """
    groups = find_and_group_similar_articles()

    # The grouping step also returns single-member groups, so filter them
    # out before deciding whether anything similar was found.
    similar_groups = [group for group in groups if len(group) > 1]
    if not similar_groups:
        print("No similar articles found.")
        return

    for articles in similar_groups:
        combined_text, link = processing_articles(articles)
        user_message = (
            f"Here are {len(articles)} texts {combined_text}, combine the following texts into a cohesive news, "
            f"remove any non-news related to all texts, and provide the cleaned data on Bosnian languageas and return as JSON only with a single 'content' field."
        )
        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message},
                ],
            )
            # The model may return slightly malformed JSON; repair it before
            # parsing.
            generated_text = repair_json(completion.choices[0].message.content)
            response_data = json.loads(generated_text)

            title = articles[0][0]
            text = response_data["content"]
            # NOTE(review): the vector is computed from the whole JSON string
            # rather than from ``text``; confirm this is intended.
            vector = embeddings.embed_query(generated_text)
            category = get_specific_data(title)[0][5]

            insert_data(
                title,
                text,
                link,
                vector,
                f"C: {', '.join(art[0] for art in articles)}",
                category,
            )
            print(f"Inserting combined: {title} and Category: {category}")
        except Exception as e:
            # Best effort: report the failing group and move on to the next.
            print(f"Error: {e}")
            print(articles[0][0])

    print("Done!.")


if __name__ == "__main__":
    processing_similar()
    publish_articles()