diff --git a/pyth/__pycache__/articles.cpython-310.pyc b/pyth/__pycache__/articles.cpython-310.pyc new file mode 100644 index 0000000..40e56eb Binary files /dev/null and b/pyth/__pycache__/articles.cpython-310.pyc differ diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index 38af9db..34597ac 100644 Binary files a/pyth/__pycache__/scrapingsingle.cpython-310.pyc and b/pyth/__pycache__/scrapingsingle.cpython-310.pyc differ diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc index 4104298..ad5b7d5 100644 Binary files a/pyth/__pycache__/vectData.cpython-310.pyc and b/pyth/__pycache__/vectData.cpython-310.pyc differ diff --git a/pyth/articles.py b/pyth/articles.py new file mode 100644 index 0000000..346a917 --- /dev/null +++ b/pyth/articles.py @@ -0,0 +1,231 @@ +import psycopg2 +import numpy as np +from sklearn.metrics.pairwise import cosine_similarity +from dotenv import load_dotenv +import os +from openai import OpenAI , APIError +from langchain.embeddings import OpenAIEmbeddings +from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data +import tiktoken +from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens +import json + +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +client = OpenAI() +embeddings = OpenAIEmbeddings() + +print(f"Checking for similar!") + +host = os.getenv("DB_HOST") +port = os.getenv("DB_PORT") +user = os.getenv("DB_USER") +password = os.getenv("DB_PASSWORD") +dbname = os.getenv("DB_NAME") + +def calculate_cosine_similarity(v1, v2): + v1_normalized = v1 / np.linalg.norm(v1) + v2_normalized = v2 / np.linalg.norm(v2) + + similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] + return similarity + +def parse_embedding_string(embedding_str): + if isinstance(embedding_str, str): + numbers = [float(num) for num in embedding_str[1:-1].split(',')] + return np.array(numbers) + elif isinstance(embedding_str, np.ndarray): + return embedding_str + else: + raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.") + + +def get_titles_links_embeddings(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;') + data = cursor.fetchall() + cursor.close() + + titles = [row[0] for row in data] + links = [row[1] for row in data] + embeddings = [parse_embedding_string(row[2]) for row in data] + + return titles, links, embeddings + +def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95): + try: + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + with conn, conn.cursor() as cursor: + titles, links, embeddings = get_titles_links_embeddings() + + processed_articles = set() + grouped_similar_articles = [] + + for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)): + if (title1, link1) not in processed_articles: + processed_articles.add((title1, link1)) + group = [(title1, link1)] + + for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)): + if i != j and (title2, link2) not in processed_articles: + similarity = calculate_cosine_similarity(embedding1, embedding2) + + if similarity > threshold: + processed_articles.add((title2, link2)) + group.append((title2, link2)) + + grouped_similar_articles.append(group) + + return grouped_similar_articles + + except psycopg2.Error as e: + print(f"Error: {e}") + return [] + +def processing_similar(): + grouped_similar_articles_result = find_and_group_similar_articles() + + if grouped_similar_articles_result: + + for group in grouped_similar_articles_result: + articles = [] + + if len(group) > 1: + for article_tuple in group: + if len(article_tuple) >= 2: + title, link = article_tuple[:2] + article = [title, link] + articles.append(article) + l = len(articles) + if l == 2: + print("2") + a_one = articles[0][0] + a_two = articles[1][0] + + get_one = get_specific_data(a_one) + get_two = get_specific_data(a_two) + + text1 = get_one[0][1] + text2 = get_two[0][1] + link1 = get_one[0][2] + link2 = get_two[0][2] + if link1 != link2: + link = f"{link1}, {link2}" + else: + link = link1 + + ftoks = num_tokens_from_string(text1) + stoks = num_tokens_from_string(text2) + tokens = ftoks + stoks + + similar_d = f"C: {a_one}, {a_two}" + + modify_similar_data(similar_d, a_one) + preparing_articles(False, a_one) + + modify_similar_data(similar_d, a_two) + preparing_articles(False, a_two) + + print(tokens) + if tokens > 2000: + combined_text = f"{text1} {text2}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" + else: + user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." + + if l == 3: + print("3") + a_one = articles[0][0] + a_two = articles[1][0] + a_three = articles[2][0] + + get_one = get_specific_data(a_one) + get_two = get_specific_data(a_two) + get_three = get_specific_data(a_three) + + text1 = get_one[0][1] + text2 = get_two[0][1] + text3 = get_three[0][1] + link1 = get_one[0][2] + link2 = get_two[0][2] + link3 = get_three[0][2] + if link1 != link2: + if link2 != link3: + link = f"{link1}, {link2}, {link3}" + else: + link = f"{link1}, {link2}" + else: + if link2 != link3: + link = f"{link1}, {link3}" + else: + link = link1 + ftoks = num_tokens_from_string(text1) + stoks = num_tokens_from_string(text2) + ttoks = num_tokens_from_string(text3) + tokens = ftoks + stoks + ttoks + + similar_d = f"C: {a_one}, {a_two}, {a_three}" + modify_similar_data(similar_d, a_one) + preparing_articles(False, a_one) + + modify_similar_data(similar_d, a_two) + preparing_articles(False, a_two) + + modify_similar_data(similar_d, a_three) + preparing_articles(False, a_three) + + print(tokens) + if tokens > 2000: + combined_text = f"{text1} {text2} {text3}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" + else: + user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." + try: + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": user_message} + ]) + generated_text = completion.choices[0].message.content + + response_data = json.loads(generated_text) + title = a_one + text = response_data["content"] + vector = embeddings.embed_query(generated_text) + + insert_data(title, text, link, vector, similar_d) + print(f"Inserting combined: {title}") + + except Exception as e: + print(f"Error: {e}") + print(f"Title: {a_one}") + print(f"Answer: {generated_text}") + continue + else: + print("No similar articles found.") +if __name__=="__main__": + processing_similar() +ready = get_ready_data() +if ready: + for a in ready: + print(f"Title: {a[0]}") + print(f"Link: {a[2]}") + print(f"Status: {a[3]}") \ No newline at end of file diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index 8e65beb..44ff2eb 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -4,7 +4,7 @@ from urllib.parse import urljoin from openai import OpenAI , APIError import os from langchain.embeddings import OpenAIEmbeddings -from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data) +from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data) import json from dotenv import load_dotenv import tiktoken @@ -48,6 +48,19 @@ def replace_with_spaces(text): cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text) return cleaned_text + +def fix_links(links_set): + modified_links = set() + + for link in links_set: + if "www" in link: + modified_link = link.replace("www.", "") + modified_links.add(modified_link) + else: + modified_links.add(link) + + return modified_links + total_links = set() collected_news = set() @@ -78,13 +91,13 @@ for dlink in dlinks: total_links.update(temp_links) final_links = {item for item in total_links if item} -i = 0 db_links = set(get_all_links()) new_links = final_links - db_links final_links = new_links +final_links = set(final_links) - +final_links = fix_links(final_links) if __name__ == '__main__': @@ -142,6 +155,7 @@ if __name__ == '__main__': print(f"Error in completion: {e}") continue + def comb_similar(): print("Checking similar") @@ -185,12 +199,17 @@ def comb_similar(): combined_text = f"{text1}{text2}{text3}" combined_text = slice_text_at_2k_tokens(combined_text) user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field" - link = f"{link1} {link2} {link3}" + if link1 != link2 and link1 != link3 and link2 != link3: + link = f"{link1} {link2} {link3}" + else: + link = link1 else: user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - link = f"{link1} {link2} {link3}" - + if link1 != link2 and link1 != link3 and link2 != link3: + link = f"{link1} {link2} {link3}" + else: + link = link1 else: ftcheck = num_tokens_from_string(f_text) stcheck = num_tokens_from_string(s_text) @@ -198,12 +217,17 @@ def comb_similar(): if fscomb <2000: combined_text = f"{f_text}{s_text}" user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field" - link = f"{link_f} {link_s}" + if link_f != link_s: + link = f"{link_f} {link_s}" + else: + link = link_f else: user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - link = f"{link_f} {link_s}" - + if link_f != link_s: + link = f"{link_f} {link_s}" + else: + link = link_f try: completion = client.chat.completions.create( model="gpt-3.5-turbo", @@ -213,7 +237,6 @@ def comb_similar(): ] ) generated_text = completion.choices[0].message.content - generated_text = generated_text if similar_article: if f_title == s_title: @@ -222,6 +245,7 @@ def comb_similar(): similar_article.remove(sa) print("Modified") else: + print(f"First: {f_title}") print(f"Second: {s_title}") modify_similar_data(first_t,"SOURCE") modify_similar_data(second_t,"SOURCE") @@ -243,5 +267,3 @@ def comb_similar(): except Exception as e: print(f"Error in completion: {e}") continue - -comb_similar() \ No newline at end of file diff --git a/pyth/templates/index.html b/pyth/templates/index.html index 9b156d8..c9e51c1 100644 --- a/pyth/templates/index.html +++ b/pyth/templates/index.html @@ -18,6 +18,5 @@ Second - \ No newline at end of file diff --git a/pyth/vectData.py b/pyth/vectData.py index e99883a..35a642c 100644 --- a/pyth/vectData.py +++ b/pyth/vectData.py @@ -83,6 +83,7 @@ def get_similar(): return similar_data + def insert_data(title, text, link, embedding, similar_d): conn = psycopg2.connect( host=host, @@ -97,9 +98,9 @@ def insert_data(title, text, link, embedding, similar_d): cursor = conn.cursor() cursor.execute(''' - INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time) - VALUES (%s, %s, %s, %s, %s ,%s); - ''', (title, text, link, embedding , similar_d, c_time)) + INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready) + VALUES (%s, %s, %s, %s, %s ,%s ,%s); + ''', (title, text, link, embedding , similar_d, c_time, True)) conn.commit() @@ -121,6 +122,39 @@ def get_data(): cursor.close() return data +def get_ready_data(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' + + cursor.execute(query, ('True',)) + data = cursor.fetchall() + cursor.close() + return data + +def get_source_data(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' + + cursor.execute(query, ('False',)) + data = cursor.fetchall() + cursor.close() + return data + + def modify_similar_data(new_value ,title): conn = psycopg2.connect( @@ -138,6 +172,24 @@ def modify_similar_data(new_value ,title): conn.commit() + +def preparing_articles(new_value ,title): + + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + + query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s ''' + + cursor.execute(query, (new_value, title)) + + conn.commit() + def get_specific_data(title): conn = psycopg2.connect( host=host, @@ -244,7 +296,9 @@ def create_db(conn): link VARCHAR, embedding vector(1536), similar_d VARCHAR, - time TIMESTAMP DEFAULT CURRENT_TIMESTAMP + time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + ready BOOLEAN + ); ''') diff --git a/pyth/web-server.py b/pyth/web-server.py index ae78c2b..ed1dc44 100644 --- a/pyth/web-server.py +++ b/pyth/web-server.py @@ -1,5 +1,5 @@ from flask import Flask , render_template , jsonify -from vectData import get_data +from vectData import get_ready_data from flask_cors import CORS @@ -21,4 +21,9 @@ def articleone(): def articletwo(): return render_template("two.html") +@app.route('/data/get/news', methods=['GET']) +def takenews(): + data = get_ready_data() + return jsonify(data) + app.run(debug=True) \ No newline at end of file