"""Group stored articles by embedding similarity and merge each group.

Run as a script: every group of two or more similar articles is sent to the
chat model to be rewritten as one news item, which is then embedded and
inserted alongside the originals.
"""

import json
import os

import numpy as np
import psycopg2
from dotenv import load_dotenv
from json_repair import repair_json
from langchain.embeddings import OpenAIEmbeddings
from openai import OpenAI

from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
from vectData import (
    calculate_cosine_similarity,
    get_specific_data,
    get_titles_links_embeddings,
    insert_data,
    modify_similar_data,
    preparing_articles,
)

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()

# Combined source text above this many tokens is truncated before prompting.
MAX_PROMPT_TOKENS = 2000

print("Checking for similar!")


def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Greedily group articles whose embeddings are nearly identical.

    Each not-yet-grouped article seeds a new group and pulls in every later
    ungrouped article whose cosine similarity to the seed exceeds
    ``threshold``. Articles are identified by their ``(title, link)`` pair.

    Args:
        eps: Unused; kept so existing callers keep working.
        min_samples: Unused; kept so existing callers keep working.
        threshold: Similarity strictly above this value joins the group.

    Returns:
        A list of groups, each a list of ``(title, link)`` tuples. Articles
        with no match come back as single-element groups. An empty list is
        returned when a ``psycopg2.Error`` is raised.
    """
    try:
        # Named ``vectors`` so the module-level ``embeddings`` client is not
        # shadowed inside this function.
        titles, links, vectors = get_titles_links_embeddings()
        records = list(zip(titles, links, vectors))
        processed = set()
        groups = []
        for i, (seed_title, seed_link, seed_vector) in enumerate(records):
            seed_key = (seed_title, seed_link)
            if seed_key in processed:
                continue
            processed.add(seed_key)
            group = [seed_key]
            # Every earlier index is already in ``processed`` (as a seed or
            # as a member), so only later records can still join.
            for other_title, other_link, other_vector in records[i + 1:]:
                other_key = (other_title, other_link)
                if other_key in processed:
                    continue
                similarity = calculate_cosine_similarity(
                    seed_vector, other_vector
                )
                if similarity > threshold:
                    processed.add(other_key)
                    group.append(other_key)
            groups.append(group)
        return groups
    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []


def _join_unique_links(links):
    """Return the distinct links in first-seen order, joined with ', '.

    A single distinct link is returned unchanged rather than formatted.
    """
    unique = list(dict.fromkeys(links))
    if len(unique) == 1:
        return unique[0]
    return ", ".join(str(item) for item in unique)


def _build_user_message(texts):
    """Build the merge prompt for ``texts``, truncating oversized input."""
    count = len(texts)
    parts = [str(text) for text in texts]
    total_tokens = sum(num_tokens_from_string(text) for text in texts)
    if total_tokens > MAX_PROMPT_TOKENS:
        combined_text = slice_text_at_2k_tokens(" ".join(parts))
        return (
            f"Here is text {combined_text}, combined from {count} sources, "
            "filter text, and make news content, return as JSON only with "
            "a single 'content' field"
        )
    listed = " ".join(parts[:-1]) + " and " + parts[-1]
    return (
        f"Here are {count} texts {listed}, combine the following texts into "
        "a cohesive news, remove any non-news related to all texts, and "
        "provide the cleaned data as a JSON only with a single 'content' "
        "field."
    )


def _merge_group(titles):
    """Merge the articles named by ``titles`` into one inserted article."""
    rows = []
    for title in titles:
        found = get_specific_data(title)
        if not found:
            # Bail out before any article is marked, so a missing row does
            # not leave the group half-processed or abort the whole run.
            print(f"Skipping group, no stored data for: {title}")
            return
        rows.append(found[0])

    # Index 1 of a row is read as the article text and index 2 as its link.
    texts = [row[1] for row in rows]
    link = _join_unique_links([row[2] for row in rows])
    similar_d = "C: " + ", ".join(str(title) for title in titles)

    # NOTE(review): the sources are marked before the model call, so a failed
    # call leaves them marked with no combined article inserted - confirm
    # that this is intended.
    for title in titles:
        modify_similar_data(similar_d, title)
        preparing_articles(False, title)

    user_message = _build_user_message(texts)
    first_title = titles[0]
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": "Data analytic, Journalist and News reporter",
                },
                {"role": "user", "content": user_message},
            ],
        )
        generated_text = completion.choices[0].message.content
        # The model does not always return strictly valid JSON.
        generated_text = repair_json(generated_text)
        response_data = json.loads(generated_text)
        text = response_data["content"]
        # NOTE(review): this embeds the repaired JSON string, not the
        # extracted ``text`` - confirm which one is meant to be stored.
        vector = embeddings.embed_query(generated_text)
        insert_data(first_title, text, link, vector, similar_d)
        print(f"Inserting combined: {first_title}")
    except Exception as e:
        # Best-effort per group: report the failure and let the caller move
        # on to the next group.
        print(f"Error: {e}")
        print(first_title)


def processing_similar():
    """Merge every group of similar articles found in storage.

    Groups with fewer than two articles are ignored. Groups of any larger
    size are merged; the combined article takes the first article's title.
    """
    groups = find_and_group_similar_articles()
    if not groups:
        print("No similar articles found.")
        return

    for group in groups:
        if len(group) < 2:
            continue
        titles = [entry[0] for entry in group if len(entry) >= 2]
        if len(titles) < 2:
            continue
        _merge_group(titles)

    # NOTE(review): the layout this was reconstructed from did not make clear
    # where "Done!." belongs; it is printed once after all groups - confirm.
    print("Done!.")


if __name__ == "__main__":
    processing_similar()