organizing code

This commit is contained in:
2024-01-07 03:41:32 +01:00
parent 96a2d88895
commit b7a0e5478c
5 changed files with 122 additions and 235 deletions

View File

@@ -1,12 +1,10 @@
import psycopg2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import os
from openai import OpenAI , APIError
from openai import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data
import tiktoken
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
import json
@@ -18,80 +16,30 @@ embeddings = OpenAIEmbeddings()
print(f"Checking for similar!")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME")
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity between two embedding vectors.

    Args:
        v1: First vector; any 1-D sequence NumPy can convert to floats.
        v2: Second vector, with the same length as ``v1``.

    Returns:
        The cosine of the angle between the two vectors as a ``float``
        (within ``[-1.0, 1.0]`` up to floating-point rounding). ``0.0`` is
        returned when either vector has zero length, because the angle is
        undefined in that case.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    # One division by the product of the norms replaces normalising each
    # vector separately and then asking a library to normalise them again.
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0.0:
        # Avoid 0/0 -> NaN for all-zero embeddings.
        return 0.0
    return float(np.dot(a, b) / denominator)
def parse_embedding_string(embedding_str):
    """Convert a stored embedding into a NumPy array.

    Args:
        embedding_str: Either a bracketed, comma-separated string such as
            ``"[0.1, 0.2]"`` or an already-built ``np.ndarray``.

    Returns:
        A 1-D ``np.ndarray`` of floats parsed from the string (empty for
        ``"[]"``); an ``np.ndarray`` argument is returned unchanged.

    Raises:
        ValueError: If the argument is neither ``str`` nor ``np.ndarray``,
            or if a component of the string is not a valid float.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str
    if not isinstance(embedding_str, str):
        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")

    # Tolerate surrounding whitespace before dropping the enclosing brackets,
    # otherwise the slice would cut off a digit instead of a bracket.
    inner = embedding_str.strip()[1:-1].strip()
    if not inner:
        # "[]" would otherwise fail on float(""); return an empty vector.
        return np.array([], dtype=float)
    return np.array([float(part) for part in inner.split(',')])
def get_titles_links_embeddings():
    """Load title, link and embedding for every row marked ready.

    Connects with the module-level ``host``/``port``/``user``/``password``/
    ``dbname`` settings and selects ``title, link, embedding`` from
    ``vectorsvevijesti`` where ``ready = True``.

    Returns:
        A tuple ``(titles, links, vectors)`` of three parallel lists, where
        each vector is the row's embedding passed through
        ``parse_embedding_string``. All three lists are empty when no row
        matches.

    Raises:
        psycopg2.Error: Propagated from connecting or running the query.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        # The cursor context manager closes the cursor even if the query fails.
        with conn.cursor() as cursor:
            cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
            rows = cursor.fetchall()
    finally:
        # Closing the cursor does not release the connection; do it explicitly.
        conn.close()

    titles = [row[0] for row in rows]
    links = [row[1] for row in rows]
    # Named "vectors" so the module-level ``embeddings`` object is not shadowed.
    vectors = [parse_embedding_string(row[2]) for row in rows]
    return titles, links, vectors
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
try:
conn = psycopg2.connect(
host=host,
port=port,
user=user,
password=password,
dbname=dbname
)
titles, links, embeddings = get_titles_links_embeddings()
with conn, conn.cursor() as cursor:
titles, links, embeddings = get_titles_links_embeddings()
processed_articles = set()
grouped_similar_articles = []
processed_articles = set()
grouped_similar_articles = []
for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)):
if (title1, link1) not in processed_articles:
processed_articles.add((title1, link1))
group = [(title1, link1)]
for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)):
if (title1, link1) not in processed_articles:
processed_articles.add((title1, link1))
group = [(title1, link1)]
for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)):
if i != j and (title2, link2) not in processed_articles:
similarity = calculate_cosine_similarity(embedding1, embedding2)
for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)):
if i != j and (title2, link2) not in processed_articles:
similarity = calculate_cosine_similarity(embedding1, embedding2)
if similarity > threshold:
processed_articles.add((title2, link2))
group.append((title2, link2))
if similarity > threshold:
processed_articles.add((title2, link2))
group.append((title2, link2))
grouped_similar_articles.append(group)
grouped_similar_articles.append(group)
return grouped_similar_articles
return grouped_similar_articles
except psycopg2.Error as e:
print(f"Error: {e}")
@@ -101,7 +49,6 @@ def processing_similar():
grouped_similar_articles_result = find_and_group_similar_articles()
if grouped_similar_articles_result:
for group in grouped_similar_articles_result:
articles = []
@@ -112,8 +59,8 @@ def processing_similar():
article = [title, link]
articles.append(article)
l = len(articles)
if l == 2:
print("2")
a_one = articles[0][0]
a_two = articles[1][0]
@@ -141,7 +88,6 @@ def processing_similar():
modify_similar_data(similar_d, a_two)
preparing_articles(False, a_two)
print(tokens)
if tokens > 2000:
combined_text = f"{text1} {text2}"
combined_text = slice_text_at_2k_tokens(combined_text)
@@ -150,7 +96,6 @@ def processing_similar():
user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
if l == 3:
print("3")
a_one = articles[0][0]
a_two = articles[1][0]
a_three = articles[2][0]
@@ -190,13 +135,82 @@ def processing_similar():
modify_similar_data(similar_d, a_three)
preparing_articles(False, a_three)
print(tokens)
if tokens > 2000:
combined_text = f"{text1} {text2} {text3}"
combined_text = slice_text_at_2k_tokens(combined_text)
user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field"
else:
user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
if l == 4:
print("4")
a_one = articles[0][0]
a_two = articles[1][0]
a_three = articles[2][0]
a_four = articles[3][0]
get_one = get_specific_data(a_one)
get_two = get_specific_data(a_two)
get_three = get_specific_data(a_three)
get_four = get_specific_data(a_four)
text1 = get_one[0][1]
text2 = get_two[0][1]
text3 = get_three[0][1]
text4 = get_four[0][1]
link1 = get_one[0][2]
link2 = get_two[0][2]
link3 = get_three[0][2]
link4 = get_four[0][2]
if link1 != link2:
if link2 != link3:
if link3 != link4:
link = f"{link1}, {link2}, {link3}, {link4}"
else:
link = f"{link1}, {link2}, {link3}"
else:
if link3 != link4:
link = f"{link1}, {link2}, {link4}"
else:
link = f"{link1}, {link2}"
else:
if link2 != link3:
if link3 != link4:
link = f"{link1}, {link3}, {link4}"
else:
link = f"{link1}, {link3}"
else:
if link3 != link4:
link = f"{link1}, {link4}"
else:
link = link1
ftoks = num_tokens_from_string(text1)
stoks = num_tokens_from_string(text2)
ttoks = num_tokens_from_string(text3)
frtoks = num_tokens_from_string(text4)
tokens = ftoks + stoks + ttoks + frtoks
similar_d = f"C: {a_one}, {a_two}, {a_three}, {a_four}"
modify_similar_data(similar_d, a_one)
preparing_articles(False, a_one)
modify_similar_data(similar_d, a_two)
preparing_articles(False, a_two)
modify_similar_data(similar_d, a_three)
preparing_articles(False, a_three)
modify_similar_data(similar_d, a_four)
preparing_articles(False, a_four)
if tokens > 2000:
combined_text = f"{text1} {text2} {text3} {text4}"
combined_text = slice_text_at_2k_tokens(combined_text)
user_message = rf"Here is text {combined_text}, combined from 4 sources, filter text, and make news content, return as JSON only with a single 'content' field"
else:
user_message = rf"Here are 4 texts {text1} {text2} {text3} and {text4}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field."
try:
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
@@ -216,16 +230,11 @@ def processing_similar():
except Exception as e:
print(f"Error: {e}")
print(f"Title: {a_one}")
print(f"Answer: {generated_text}")
print(a_one)
continue
else:
print("Done!.")
else:
print("No similar articles found.")
if __name__=="__main__":
processing_similar()
ready = get_ready_data()
if ready:
for a in ready:
print(f"Title: {a[0]}")
print(f"Link: {a[2]}")
print(f"Status: {a[3]}")