diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index 34597ac..2598ed5 100644 Binary files a/pyth/__pycache__/scrapingsingle.cpython-310.pyc and b/pyth/__pycache__/scrapingsingle.cpython-310.pyc differ diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc index ad5b7d5..9cb85af 100644 Binary files a/pyth/__pycache__/vectData.cpython-310.pyc and b/pyth/__pycache__/vectData.cpython-310.pyc differ diff --git a/pyth/articles.py b/pyth/articles.py index 346a917..56d5c5a 100644 --- a/pyth/articles.py +++ b/pyth/articles.py @@ -1,12 +1,10 @@ import psycopg2 import numpy as np -from sklearn.metrics.pairwise import cosine_similarity from dotenv import load_dotenv import os -from openai import OpenAI , APIError +from openai import OpenAI from langchain.embeddings import OpenAIEmbeddings -from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data -import tiktoken +from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens import json @@ -18,80 +16,30 @@ embeddings = OpenAIEmbeddings() print(f"Checking for similar!") -host = os.getenv("DB_HOST") -port = os.getenv("DB_PORT") -user = os.getenv("DB_USER") -password = os.getenv("DB_PASSWORD") -dbname = os.getenv("DB_NAME") - -def calculate_cosine_similarity(v1, v2): - v1_normalized = v1 / np.linalg.norm(v1) - v2_normalized = v2 / np.linalg.norm(v2) - - similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] - return similarity - -def parse_embedding_string(embedding_str): - if isinstance(embedding_str, str): - numbers = [float(num) for num in embedding_str[1:-1].split(',')] - return np.array(numbers) - elif isinstance(embedding_str, np.ndarray): - return embedding_str - else: - raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.") - - -def get_titles_links_embeddings(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() - cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;') - data = cursor.fetchall() - cursor.close() - - titles = [row[0] for row in data] - links = [row[1] for row in data] - embeddings = [parse_embedding_string(row[2]) for row in data] - - return titles, links, embeddings def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95): try: - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) + titles, links, embeddings = get_titles_links_embeddings() - with conn, conn.cursor() as cursor: - titles, links, embeddings = get_titles_links_embeddings() + processed_articles = set() + grouped_similar_articles = [] - processed_articles = set() - grouped_similar_articles = [] + for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)): + if (title1, link1) not in processed_articles: + processed_articles.add((title1, link1)) + group = [(title1, link1)] - for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)): - if (title1, link1) not in processed_articles: - processed_articles.add((title1, link1)) - group = [(title1, link1)] + for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)): + if i != j and (title2, link2) not in processed_articles: + similarity = calculate_cosine_similarity(embedding1, embedding2) - for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)): - if i != j and (title2, link2) not in processed_articles: - similarity = calculate_cosine_similarity(embedding1, embedding2) + if similarity > threshold: + processed_articles.add((title2, link2)) + group.append((title2, link2)) - if similarity > threshold: - processed_articles.add((title2, link2)) - group.append((title2, link2)) + grouped_similar_articles.append(group) - grouped_similar_articles.append(group) - - return grouped_similar_articles + return grouped_similar_articles except psycopg2.Error as e: print(f"Error: {e}") @@ -101,7 +49,6 @@ def processing_similar(): grouped_similar_articles_result = find_and_group_similar_articles() if grouped_similar_articles_result: - for group in grouped_similar_articles_result: articles = [] @@ -112,8 +59,8 @@ def processing_similar(): article = [title, link] articles.append(article) l = len(articles) + if l == 2: - print("2") a_one = articles[0][0] a_two = articles[1][0] @@ -141,7 +88,6 @@ def processing_similar(): modify_similar_data(similar_d, a_two) preparing_articles(False, a_two) - print(tokens) if tokens > 2000: combined_text = f"{text1} {text2}" combined_text = slice_text_at_2k_tokens(combined_text) @@ -150,7 +96,6 @@ def processing_similar(): user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." if l == 3: - print("3") a_one = articles[0][0] a_two = articles[1][0] a_three = articles[2][0] @@ -190,13 +135,82 @@ def processing_similar(): modify_similar_data(similar_d, a_three) preparing_articles(False, a_three) - print(tokens) if tokens > 2000: combined_text = f"{text1} {text2} {text3}" combined_text = slice_text_at_2k_tokens(combined_text) user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" else: user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." + if l == 4: + print("4") + a_one = articles[0][0] + a_two = articles[1][0] + a_three = articles[2][0] + a_four = articles[3][0] + + get_one = get_specific_data(a_one) + get_two = get_specific_data(a_two) + get_three = get_specific_data(a_three) + get_four = get_specific_data(a_four) + + text1 = get_one[0][1] + text2 = get_two[0][1] + text3 = get_three[0][1] + text4 = get_four[0][1] + link1 = get_one[0][2] + link2 = get_two[0][2] + link3 = get_three[0][2] + link4 = get_four[0][2] + + if link1 != link2: + if link2 != link3: + if link3 != link4: + link = f"{link1}, {link2}, {link3}, {link4}" + else: + link = f"{link1}, {link2}, {link3}" + else: + if link3 != link4: + link = f"{link1}, {link2}, {link4}" + else: + link = f"{link1}, {link2}" + else: + if link2 != link3: + if link3 != link4: + link = f"{link1}, {link3}, {link4}" + else: + link = f"{link1}, {link3}" + else: + if link3 != link4: + link = f"{link1}, {link4}" + else: + link = link1 + + ftoks = num_tokens_from_string(text1) + stoks = num_tokens_from_string(text2) + ttoks = num_tokens_from_string(text3) + frtoks = num_tokens_from_string(text4) + + tokens = ftoks + stoks + ttoks + frtoks + + similar_d = f"C: {a_one}, {a_two}, {a_three}, {a_four}" + modify_similar_data(similar_d, a_one) + preparing_articles(False, a_one) + + modify_similar_data(similar_d, a_two) + preparing_articles(False, a_two) + + modify_similar_data(similar_d, a_three) + preparing_articles(False, a_three) + + modify_similar_data(similar_d, a_four) + preparing_articles(False, a_four) + + if tokens > 2000: + combined_text = f"{text1} {text2} {text3} {text4}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 4 sources, filter text, and make news content, return as JSON only with a single 'content' field" + else: + user_message = rf"Here are 4 texts {text1} {text2} {text3} and {text4}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field." try: completion = client.chat.completions.create( model="gpt-3.5-turbo", @@ -216,16 +230,11 @@ def processing_similar(): except Exception as e: print(f"Error: {e}") - print(f"Title: {a_one}") - print(f"Answer: {generated_text}") + print(a_one) continue + else: + print("Done!.") else: print("No similar articles found.") if __name__=="__main__": processing_similar() -ready = get_ready_data() -if ready: - for a in ready: - print(f"Title: {a[0]}") - print(f"Link: {a[2]}") - print(f"Status: {a[3]}") \ No newline at end of file diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index ac86b52..e939adb 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -1,10 +1,10 @@ from bs4 import BeautifulSoup import requests from urllib.parse import urljoin -from openai import OpenAI , APIError +from openai import OpenAI import os from langchain.embeddings import OpenAIEmbeddings -from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data) +from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing ) import json from dotenv import load_dotenv import tiktoken @@ -39,7 +39,7 @@ def slice_text_at_2k_tokens(text): sliced_tokens = tokens[:max_tokens] sliced_text = encoding.decode(sliced_tokens) - + return sliced_text @@ -82,7 +82,6 @@ def get_article_links(url, already_checked): return link_store - already_checked = set() for dlink in dlinks: @@ -116,8 +115,6 @@ if __name__ == '__main__': title_text = replace_with_spaces(title_text) - - print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}") text_text = slice_text_at_2k_tokens(text_text) text_text = replace_with_spaces(str(text_text)) @@ -138,13 +135,6 @@ if __name__ == '__main__': title = response_data["title"] text = response_data["content"] - #print("*********************************") - #print(f"Title: {title}") - #print("---------------------------------") - #print(f"Content : {text}") - #print("*********************************") - - vector = embeddings.embed_query(generated_text) if not is_similar_data(title, text, link, vector, threshold=0.98): diff --git a/pyth/vectData.py b/pyth/vectData.py index 35a642c..e3deda7 100644 --- a/pyth/vectData.py +++ b/pyth/vectData.py @@ -7,7 +7,6 @@ import os from dotenv import load_dotenv from datetime import datetime ,timedelta - load_dotenv() host = os.getenv("DB_HOST") @@ -27,20 +26,20 @@ conn = psycopg2.connect( def calculate_cosine_similarity(v1, v2): v1_normalized = v1 / np.linalg.norm(v1) v2_normalized = v2 / np.linalg.norm(v2) - similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] return similarity -def is_similar_data(title, text, link, embedding, threshold=0.98): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() +def parse_embedding_string(embedding_str): + if isinstance(embedding_str, str): + numbers = [float(num) for num in embedding_str[1:-1].split(',')] + return np.array(numbers) + elif isinstance(embedding_str, np.ndarray): + return embedding_str + else: + raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.") +def is_similar_data(title, text, link, embedding, threshold=0.98): + cursor = conn.cursor() cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;') existing_embeddings = cursor.fetchall() @@ -54,12 +53,12 @@ def is_similar_data(title, text, link, embedding, threshold=0.98): similar_d = existing_title insert_data(title,text,link,embedding,similar_d) print(f"Similar data found: \n #{title} \n #{existing_title}") - print(f"Inserting: #{title} \n") + print(f"Inserting: #{title}") similar_d = "NO" cursor.close() return True else: - print(f"Same source of same article!") + print(f"Same article of same source!") cursor.close() return True @@ -68,13 +67,6 @@ def is_similar_data(title, text, link, embedding, threshold=0.98): return False def get_similar(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''' cursor.execute(query) @@ -82,73 +74,49 @@ def get_similar(): cursor.close() return similar_data +def get_titles_links_embeddings(): + cursor = conn.cursor() + cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;') + data = cursor.fetchall() + cursor.close() + + titles = [row[0] for row in data] + links = [row[1] for row in data] + embeddings = [parse_embedding_string(row[2]) for row in data] + + return titles, links, embeddings def insert_data(title, text, link, embedding, similar_d): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) c_time = datetime.now() - - cursor = conn.cursor() - cursor.execute(''' INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready) VALUES (%s, %s, %s, %s, %s ,%s ,%s); ''', (title, text, link, embedding , similar_d, c_time, True)) - conn.commit() - cursor.close() def get_data(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) + cursor = conn.cursor() query = '''SELECT title,text,link FROM vectorsvevijesti;''' - cursor.execute(query) data = cursor.fetchall() cursor.close() return data def get_ready_data(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' - cursor.execute(query, ('True',)) data = cursor.fetchall() cursor.close() return data def get_source_data(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' - cursor.execute(query, ('False',)) data = cursor.fetchall() cursor.close() @@ -156,138 +124,60 @@ def get_source_data(): def modify_similar_data(new_value ,title): - - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() - query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''' - cursor.execute(query, (new_value, title)) - conn.commit() def preparing_articles(new_value ,title): - - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() - query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s ''' - cursor.execute(query, (new_value, title)) - conn.commit() def get_specific_data(title): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() - query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s''' + query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s''' cursor.execute(query, (title,)) - specific_post = cursor.fetchall() cursor.close() return specific_post + def get_all_links(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) cursor = conn.cursor() query = '''SELECT link FROM vectorsvevijesti''' cursor.execute(query) - db_links = {link[0] for link in cursor.fetchall()} cursor.close() return db_links def delete_specific(title): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() query = '''DELETE FROM vectorsvevijesti WHERE title = %s''' - cursor.execute(query,(title,)) cursor.close() def cleansing(): - - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - day_long = datetime.now() - timedelta(days=1) - cursor = conn.cursor() - query = '''DELETE FROM vectorsvevijesti WHERE time < %s''' cursor.execute(query,(day_long,)) - conn.commit() cursor.close() def drop_table(): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) - cursor = conn.cursor() - query = '''DROP TABLE IF EXISTS vectorsvevijesti;''' cursor.execute(query) - conn.commit() cursor.close() -def create_db(conn): - conn = psycopg2.connect( - host=host, - port=port, - user=user, - password=password, - dbname=dbname - ) +def create_db(): cursor = conn.cursor() - cursor.execute("CREATE EXTENSION IF NOT EXISTS vector") - register_vector(conn) - cursor.execute(''' CREATE TABLE IF NOT EXISTS vectorsvevijesti ( id bigserial PRIMARY KEY, @@ -298,10 +188,8 @@ def create_db(conn): similar_d VARCHAR, time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, ready BOOLEAN - ); ''') - conn.commit() cursor.close() -create_db(conn) +create_db()