2024-01-06 08:17:05 +01:00
import psycopg2
import numpy as np
from dotenv import load_dotenv
import os
2024-01-07 03:41:32 +01:00
from openai import OpenAI
2024-01-06 08:17:05 +01:00
from langchain . embeddings import OpenAIEmbeddings
2024-01-07 03:41:32 +01:00
from vectData import get_specific_data , modify_similar_data , insert_data , preparing_articles , calculate_cosine_similarity , get_titles_links_embeddings
2024-01-06 08:17:05 +01:00
from scrapingsingle import num_tokens_from_string , slice_text_at_2k_tokens
import json
2024-01-08 00:28:20 +01:00
from json_repair import repair_json
2024-01-06 08:17:05 +01:00
load_dotenv ( )
OPENAI_API_KEY = os . getenv ( " OPENAI_API_KEY " )
client = OpenAI ( )
embeddings = OpenAIEmbeddings ( )
print ( f " Checking for similar! " )
def find_and_group_similar_articles ( eps = 0.5 , min_samples = 2 , threshold = 0.95 ) :
try :
2024-01-07 03:41:32 +01:00
titles , links , embeddings = get_titles_links_embeddings ( )
2024-01-06 08:17:05 +01:00
2024-01-07 03:41:32 +01:00
processed_articles = set ( )
grouped_similar_articles = [ ]
2024-01-06 08:17:05 +01:00
2024-01-07 03:41:32 +01:00
for i , ( title1 , link1 , embedding1 ) in enumerate ( zip ( titles , links , embeddings ) ) :
if ( title1 , link1 ) not in processed_articles :
processed_articles . add ( ( title1 , link1 ) )
group = [ ( title1 , link1 ) ]
2024-01-06 08:17:05 +01:00
2024-01-07 03:41:32 +01:00
for j , ( title2 , link2 , embedding2 ) in enumerate ( zip ( titles , links , embeddings ) ) :
if i != j and ( title2 , link2 ) not in processed_articles :
similarity = calculate_cosine_similarity ( embedding1 , embedding2 )
2024-01-06 08:17:05 +01:00
2024-01-07 03:41:32 +01:00
if similarity > threshold :
processed_articles . add ( ( title2 , link2 ) )
group . append ( ( title2 , link2 ) )
2024-01-06 08:17:05 +01:00
2024-01-07 03:41:32 +01:00
grouped_similar_articles . append ( group )
2024-01-06 08:17:05 +01:00
2024-01-07 03:41:32 +01:00
return grouped_similar_articles
2024-01-06 08:17:05 +01:00
except psycopg2 . Error as e :
print ( f " Error: { e } " )
return [ ]
def processing_similar ( ) :
grouped_similar_articles_result = find_and_group_similar_articles ( )
if grouped_similar_articles_result :
for group in grouped_similar_articles_result :
articles = [ ]
if len ( group ) > 1 :
for article_tuple in group :
if len ( article_tuple ) > = 2 :
title , link = article_tuple [ : 2 ]
article = [ title , link ]
articles . append ( article )
l = len ( articles )
2024-01-07 03:41:32 +01:00
2024-01-06 08:17:05 +01:00
if l == 2 :
a_one = articles [ 0 ] [ 0 ]
a_two = articles [ 1 ] [ 0 ]
get_one = get_specific_data ( a_one )
get_two = get_specific_data ( a_two )
text1 = get_one [ 0 ] [ 1 ]
text2 = get_two [ 0 ] [ 1 ]
link1 = get_one [ 0 ] [ 2 ]
link2 = get_two [ 0 ] [ 2 ]
if link1 != link2 :
link = f " { link1 } , { link2 } "
else :
link = link1
ftoks = num_tokens_from_string ( text1 )
stoks = num_tokens_from_string ( text2 )
tokens = ftoks + stoks
similar_d = f " C: { a_one } , { a_two } "
modify_similar_data ( similar_d , a_one )
preparing_articles ( False , a_one )
modify_similar_data ( similar_d , a_two )
preparing_articles ( False , a_two )
if tokens > 2000 :
combined_text = f " { text1 } { text2 } "
combined_text = slice_text_at_2k_tokens ( combined_text )
user_message = rf " Here is text { combined_text } , combined from 3 sources, filter text, and make news content, return as JSON only with single ' content ' field "
else :
user_message = rf " Here are 2 texts { text1 } { text2 } , combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single ' content ' field. "
if l == 3 :
a_one = articles [ 0 ] [ 0 ]
a_two = articles [ 1 ] [ 0 ]
a_three = articles [ 2 ] [ 0 ]
get_one = get_specific_data ( a_one )
get_two = get_specific_data ( a_two )
get_three = get_specific_data ( a_three )
text1 = get_one [ 0 ] [ 1 ]
text2 = get_two [ 0 ] [ 1 ]
text3 = get_three [ 0 ] [ 1 ]
link1 = get_one [ 0 ] [ 2 ]
link2 = get_two [ 0 ] [ 2 ]
link3 = get_three [ 0 ] [ 2 ]
if link1 != link2 :
if link2 != link3 :
link = f " { link1 } , { link2 } , { link3 } "
else :
link = f " { link1 } , { link2 } "
else :
if link2 != link3 :
link = f " { link1 } , { link3 } "
else :
link = link1
ftoks = num_tokens_from_string ( text1 )
stoks = num_tokens_from_string ( text2 )
ttoks = num_tokens_from_string ( text3 )
tokens = ftoks + stoks + ttoks
similar_d = f " C: { a_one } , { a_two } , { a_three } "
modify_similar_data ( similar_d , a_one )
preparing_articles ( False , a_one )
modify_similar_data ( similar_d , a_two )
preparing_articles ( False , a_two )
modify_similar_data ( similar_d , a_three )
preparing_articles ( False , a_three )
if tokens > 2000 :
combined_text = f " { text1 } { text2 } { text3 } "
combined_text = slice_text_at_2k_tokens ( combined_text )
user_message = rf " Here is text { combined_text } , combined from 3 sources, filter text, and make news content, return as JSON only with single ' content ' field "
else :
user_message = rf " Here are 3 texts { text1 } { text2 } and { text3 } , combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single ' content ' field. "
2024-01-07 03:41:32 +01:00
if l == 4 :
a_one = articles [ 0 ] [ 0 ]
a_two = articles [ 1 ] [ 0 ]
a_three = articles [ 2 ] [ 0 ]
a_four = articles [ 3 ] [ 0 ]
get_one = get_specific_data ( a_one )
get_two = get_specific_data ( a_two )
get_three = get_specific_data ( a_three )
get_four = get_specific_data ( a_four )
text1 = get_one [ 0 ] [ 1 ]
text2 = get_two [ 0 ] [ 1 ]
text3 = get_three [ 0 ] [ 1 ]
text4 = get_four [ 0 ] [ 1 ]
link1 = get_one [ 0 ] [ 2 ]
link2 = get_two [ 0 ] [ 2 ]
link3 = get_three [ 0 ] [ 2 ]
link4 = get_four [ 0 ] [ 2 ]
if link1 != link2 :
if link2 != link3 :
if link3 != link4 :
link = f " { link1 } , { link2 } , { link3 } , { link4 } "
else :
link = f " { link1 } , { link2 } , { link3 } "
else :
if link3 != link4 :
link = f " { link1 } , { link2 } , { link4 } "
else :
link = f " { link1 } , { link2 } "
else :
if link2 != link3 :
if link3 != link4 :
link = f " { link1 } , { link3 } , { link4 } "
else :
link = f " { link1 } , { link3 } "
else :
if link3 != link4 :
link = f " { link1 } , { link4 } "
else :
link = link1
ftoks = num_tokens_from_string ( text1 )
stoks = num_tokens_from_string ( text2 )
ttoks = num_tokens_from_string ( text3 )
frtoks = num_tokens_from_string ( text4 )
tokens = ftoks + stoks + ttoks + frtoks
similar_d = f " C: { a_one } , { a_two } , { a_three } , { a_four } "
modify_similar_data ( similar_d , a_one )
preparing_articles ( False , a_one )
modify_similar_data ( similar_d , a_two )
preparing_articles ( False , a_two )
modify_similar_data ( similar_d , a_three )
preparing_articles ( False , a_three )
modify_similar_data ( similar_d , a_four )
preparing_articles ( False , a_four )
if tokens > 2000 :
combined_text = f " { text1 } { text2 } { text3 } { text4 } "
combined_text = slice_text_at_2k_tokens ( combined_text )
user_message = rf " Here is text { combined_text } , combined from 4 sources, filter text, and make news content, return as JSON only with a single ' content ' field "
else :
user_message = rf " Here are 4 texts { text1 } { text2 } { text3 } and { text4 } , combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single ' content ' field. "
2024-01-06 08:17:05 +01:00
try :
completion = client . chat . completions . create (
model = " gpt-3.5-turbo " ,
messages = [
{ " role " : " system " , " content " : " Data analytic, Journalist and News reporter " } ,
{ " role " : " user " , " content " : user_message }
] )
generated_text = completion . choices [ 0 ] . message . content
2024-01-08 00:28:20 +01:00
generated_text = repair_json ( generated_text )
2024-01-06 08:17:05 +01:00
response_data = json . loads ( generated_text )
title = a_one
text = response_data [ " content " ]
vector = embeddings . embed_query ( generated_text )
insert_data ( title , text , link , vector , similar_d )
print ( f " Inserting combined: { title } " )
except Exception as e :
print ( f " Error: { e } " )
2024-01-07 03:41:32 +01:00
print ( a_one )
2024-01-06 08:17:05 +01:00
continue
2024-01-07 03:41:32 +01:00
else :
print ( " Done!. " )
2024-01-06 08:17:05 +01:00
else :
print ( " No similar articles found. " )
if __name__ == " __main__ " :
processing_similar ( )