242 lines
11 KiB
Python
242 lines
11 KiB
Python
import psycopg2
|
|
import numpy as np
|
|
from dotenv import load_dotenv
|
|
import os
|
|
from openai import OpenAI
|
|
from langchain.embeddings import OpenAIEmbeddings
|
|
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
|
|
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
|
|
import json
|
|
from json_repair import repair_json
|
|
|
|
# Load variables from a local .env file into the process environment before
# anything below reads them.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Module-wide OpenAI chat client and embedding model; both are constructed
# without explicit arguments.
client = OpenAI()
embeddings = OpenAIEmbeddings()

print("Checking for similar!")
|
|
|
|
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Greedily bucket stored articles whose embeddings are near-duplicates.

    Each article that has not been claimed yet seeds a new group and claims
    every other unclaimed article whose cosine similarity to the seed is
    strictly greater than ``threshold``.  Candidates are compared with the
    seed only, never with the other members of the group.

    Args:
        eps: Accepted for compatibility; not used by this implementation.
        min_samples: Accepted for compatibility; not used by this
            implementation.
        threshold: Similarity a candidate must exceed to join a group.

    Returns:
        A list of groups, each a list of ``(title, link)`` tuples with the
        seed first.  Articles without any match come back as one-element
        groups.  An empty list is returned if a ``psycopg2.Error`` is raised
        while the articles are being loaded or grouped.
    """
    try:
        titles, links, vectors = get_titles_links_embeddings()

        claimed = set()
        groups = []

        for seed_idx, (seed_title, seed_link, seed_vec) in enumerate(
            zip(titles, links, vectors)
        ):
            seed_key = (seed_title, seed_link)
            if seed_key in claimed:
                continue

            claimed.add(seed_key)
            members = [seed_key]

            for other_idx, (other_title, other_link, other_vec) in enumerate(
                zip(titles, links, vectors)
            ):
                other_key = (other_title, other_link)
                if other_idx == seed_idx or other_key in claimed:
                    continue

                if calculate_cosine_similarity(seed_vec, other_vec) > threshold:
                    claimed.add(other_key)
                    members.append(other_key)

            groups.append(members)

        return groups

    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []
|
|
|
|
# Total source-token count above which the joined text is truncated with
# slice_text_at_2k_tokens() before it is placed in the prompt.
_MAX_PROMPT_TOKENS = 2000


def _merge_links(links):
    """Collapse duplicate links, keeping first-seen order.

    Returns the single link unchanged when all entries are equal, otherwise
    the distinct links joined with ", ".
    """
    distinct = list(dict.fromkeys(links))
    if len(distinct) == 1:
        return distinct[0]
    return ", ".join(str(item) for item in distinct)


def _build_prompt(texts, token_total):
    """Build the user prompt asking the model to merge ``texts`` into one story."""
    count = len(texts)
    if token_total > _MAX_PROMPT_TOKENS:
        combined_text = slice_text_at_2k_tokens(" ".join(texts))
        return (
            f"Here is text {combined_text}, combined from {count} sources, "
            "filter text, and make news content, return as JSON only with "
            "a single 'content' field"
        )

    leading = " ".join(texts[:-1])
    return (
        f"Here are {count} texts {leading} and {texts[-1]}, combine the "
        "following texts into a cohesive news, remove any non-news related "
        "to all texts, and provide the cleaned data as a JSON only with "
        "a single 'content' field."
    )


def _load_rows(titles):
    """Fetch the first stored row for every title, or None if any is missing."""
    rows = []
    for title in titles:
        found = get_specific_data(title)
        if not found:
            return None
        rows.append(found[0])
    return rows


def processing_similar():
    """Merge every group of similar articles into one combined article.

    For each group of two or more articles returned by
    ``find_and_group_similar_articles()`` this:

    1. loads each article's stored row by title (index 1 of the row is used
       as the text, index 2 as the link);
    2. records the group label ``"C: <title>, <title>, ..."`` on every source
       article via ``modify_similar_data`` and calls
       ``preparing_articles(False, title)`` for it;
    3. asks the chat model to merge the texts into JSON with a ``content``
       field, truncating the joined text first when the sources total more
       than ``_MAX_PROMPT_TOKENS`` tokens;
    4. inserts the result under the first article's title together with the
       de-duplicated links, an embedding and the group label.

    Groups of any size are handled by the same code path.  A group whose
    model call, JSON parsing or insert fails is reported on stdout and
    skipped; the remaining groups are still processed.  Returns ``None``.
    """
    groups = find_and_group_similar_articles()
    if not groups:
        print("No similar articles found.")
        return

    for group in groups:
        if len(group) < 2:
            continue

        # Only well-formed (title, link, ...) entries take part in the merge.
        titles = [entry[0] for entry in group if len(entry) >= 2]
        if len(titles) < 2:
            continue

        first_title = titles[0]

        rows = _load_rows(titles)
        if rows is None:
            # A missing row used to raise IndexError and abort the whole run.
            print(f"Error: missing stored article in group {titles}")
            continue

        texts = [row[1] for row in rows]
        link = _merge_links([row[2] for row in rows])
        token_total = sum(num_tokens_from_string(text) for text in texts)

        similar_d = "C: " + ", ".join(str(title) for title in titles)
        for title in titles:
            modify_similar_data(similar_d, title)
            preparing_articles(False, title)

        user_message = _build_prompt(texts, token_total)

        # Per-group boundary: any failure in the model call, JSON handling or
        # insert is reported and must not stop the remaining groups.
        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message},
                ],
            )
            generated_text = repair_json(completion.choices[0].message.content)
            response_data = json.loads(generated_text)
            text = response_data["content"]

            # NOTE(review): this embeds the repaired JSON string rather than
            # the extracted ``text``; kept as-is, confirm it is intended.
            vector = embeddings.embed_query(generated_text)

            insert_data(first_title, text, link, vector, similar_d)
            print(f"Inserting combined: {first_title}")
        except Exception as e:
            print(f"Error: {e}")
            print(first_title)
            continue

    # NOTE(review): emitted once after all groups; the original placement of
    # this message could not be recovered from the unindented source.
    print("Done!.")
|
|
# Run the merge pass only when this file is executed directly as a script.
if __name__ == "__main__":
    processing_similar()
|