import psycopg2 import numpy as np from sklearn.metrics.pairwise import cosine_similarity from dotenv import load_dotenv import os from openai import OpenAI , APIError from langchain.embeddings import OpenAIEmbeddings from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data import tiktoken from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens import json load_dotenv() OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") client = OpenAI() embeddings = OpenAIEmbeddings() print(f"Checking for similar!") host = os.getenv("DB_HOST") port = os.getenv("DB_PORT") user = os.getenv("DB_USER") password = os.getenv("DB_PASSWORD") dbname = os.getenv("DB_NAME") def calculate_cosine_similarity(v1, v2): v1_normalized = v1 / np.linalg.norm(v1) v2_normalized = v2 / np.linalg.norm(v2) similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] return similarity def parse_embedding_string(embedding_str): if isinstance(embedding_str, str): numbers = [float(num) for num in embedding_str[1:-1].split(',')] return np.array(numbers) elif isinstance(embedding_str, np.ndarray): return embedding_str else: raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.") def get_titles_links_embeddings(): conn = psycopg2.connect( host=host, port=port, user=user, password=password, dbname=dbname ) cursor = conn.cursor() cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;') data = cursor.fetchall() cursor.close() titles = [row[0] for row in data] links = [row[1] for row in data] embeddings = [parse_embedding_string(row[2]) for row in data] return titles, links, embeddings def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95): try: conn = psycopg2.connect( host=host, port=port, user=user, password=password, dbname=dbname ) with conn, conn.cursor() as cursor: titles, links, embeddings = get_titles_links_embeddings() processed_articles = set() grouped_similar_articles = [] for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)): if (title1, link1) not in processed_articles: processed_articles.add((title1, link1)) group = [(title1, link1)] for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)): if i != j and (title2, link2) not in processed_articles: similarity = calculate_cosine_similarity(embedding1, embedding2) if similarity > threshold: processed_articles.add((title2, link2)) group.append((title2, link2)) grouped_similar_articles.append(group) return grouped_similar_articles except psycopg2.Error as e: print(f"Error: {e}") return [] def processing_similar(): grouped_similar_articles_result = find_and_group_similar_articles() if grouped_similar_articles_result: for group in grouped_similar_articles_result: articles = [] if len(group) > 1: for article_tuple in group: if len(article_tuple) >= 2: title, link = article_tuple[:2] article = [title, link] articles.append(article) l = len(articles) if l == 2: print("2") a_one = articles[0][0] a_two = articles[1][0] get_one = get_specific_data(a_one) get_two = get_specific_data(a_two) text1 = get_one[0][1] text2 = get_two[0][1] link1 = get_one[0][2] link2 = get_two[0][2] if link1 != link2: link = f"{link1}, {link2}" else: link = link1 ftoks = num_tokens_from_string(text1) stoks = num_tokens_from_string(text2) tokens = ftoks + stoks similar_d = f"C: {a_one}, {a_two}" modify_similar_data(similar_d, a_one) preparing_articles(False, a_one) modify_similar_data(similar_d, a_two) preparing_articles(False, a_two) print(tokens) if tokens > 2000: combined_text = f"{text1} {text2}" combined_text = slice_text_at_2k_tokens(combined_text) user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" else: user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." if l == 3: print("3") a_one = articles[0][0] a_two = articles[1][0] a_three = articles[2][0] get_one = get_specific_data(a_one) get_two = get_specific_data(a_two) get_three = get_specific_data(a_three) text1 = get_one[0][1] text2 = get_two[0][1] text3 = get_three[0][1] link1 = get_one[0][2] link2 = get_two[0][2] link3 = get_three[0][2] if link1 != link2: if link2 != link3: link = f"{link1}, {link2}, {link3}" else: link = f"{link1}, {link2}" else: if link2 != link3: link = f"{link1}, {link3}" else: link = link1 ftoks = num_tokens_from_string(text1) stoks = num_tokens_from_string(text2) ttoks = num_tokens_from_string(text3) tokens = ftoks + stoks + ttoks similar_d = f"C: {a_one}, {a_two}, {a_three}" modify_similar_data(similar_d, a_one) preparing_articles(False, a_one) modify_similar_data(similar_d, a_two) preparing_articles(False, a_two) modify_similar_data(similar_d, a_three) preparing_articles(False, a_three) print(tokens) if tokens > 2000: combined_text = f"{text1} {text2} {text3}" combined_text = slice_text_at_2k_tokens(combined_text) user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" else: user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." try: completion = client.chat.completions.create( model="gpt-3.5-turbo", messages=[ {"role": "system", "content": "Data analytic, Journalist and News reporter"}, {"role": "user", "content": user_message} ]) generated_text = completion.choices[0].message.content response_data = json.loads(generated_text) title = a_one text = response_data["content"] vector = embeddings.embed_query(generated_text) insert_data(title, text, link, vector, similar_d) print(f"Inserting combined: {title}") except Exception as e: print(f"Error: {e}") print(f"Title: {a_one}") print(f"Answer: {generated_text}") continue else: print("No similar articles found.") if __name__=="__main__": processing_similar() ready = get_ready_data() if ready: for a in ready: print(f"Title: {a[0]}") print(f"Link: {a[2]}") print(f"Status: {a[3]}")