# old-svevijesti/pyth/articles.py

import psycopg2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import os
from openai import OpenAI , APIError
from langchain.embeddings import OpenAIEmbeddings
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data
import tiktoken
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
import json

# Pull settings from a local .env file into the process environment before
# anything below reads them.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# NOTE(review): both clients are built without explicit arguments; presumably
# they pick the API key up from the environment loaded above -- confirm.
client = OpenAI()
embeddings = OpenAIEmbeddings()

print("Checking for similar!")

# PostgreSQL connection settings, shared by every function that opens a
# connection in this module.
host, port, user, password, dbname = (
    os.getenv(setting)
    for setting in ("DB_HOST", "DB_PORT", "DB_USER", "DB_PASSWORD", "DB_NAME")
)

def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        v1: First vector (NumPy array or any sequence of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        The cosine of the angle between the vectors as a float. If either
        vector has zero length the similarity is undefined; ``0.0`` is
        returned so such a vector never counts as similar to anything.

    Raises:
        ValueError: If the two vectors do not have the same shape.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            f"Vectors must have the same shape, got {a.shape} and {b.shape}."
        )

    # Dividing the dot product by both norms is the whole computation; there
    # is no need to normalise each vector first.
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0.0:
        # Guard against 0/0 -> NaN for all-zero embeddings.
        return 0.0
    return float(np.dot(a, b) / denominator)

def parse_embedding_string(embedding_str):
    """Convert a stored embedding into a NumPy array of floats.

    Args:
        embedding_str: One of
            * a ``str`` of comma-separated numbers, optionally wrapped in
              ``[]``, ``{}`` or ``()`` and surrounding whitespace;
            * a ``np.ndarray``, which is returned unchanged;
            * a ``list`` or ``tuple`` of numbers.

    Returns:
        A ``np.ndarray``. For string and list/tuple input it is a 1-D float
        array; an empty embedding (e.g. ``"[]"``) yields an empty array.

    Raises:
        ValueError: If ``embedding_str`` is of an unsupported type, or a
            string element cannot be parsed as a float.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str

    if isinstance(embedding_str, str):
        # Strip only real wrapper characters. Blindly dropping the first and
        # last character would silently eat digits of an unbracketed value
        # ("12,34" -> "2,3").
        inner = embedding_str.strip().strip("[]{}()").strip()
        if not inner:
            return np.array([], dtype=float)
        return np.array([float(num) for num in inner.split(',')], dtype=float)

    if isinstance(embedding_str, (list, tuple)):
        return np.array(embedding_str, dtype=float)

    raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")


def get_titles_links_embeddings():
    """Load title, link and embedding of every ready article.

    Opens a PostgreSQL connection from the module-level settings, reads all
    rows of ``vectorsvevijesti`` whose ``ready`` flag is true, and closes the
    connection again.

    Returns:
        A ``(titles, links, embeddings)`` tuple of three equally long lists.
        Each embedding is passed through ``parse_embedding_string``.

    Raises:
        psycopg2.Error: If connecting or querying fails.
        ValueError: If a stored embedding cannot be parsed.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
            rows = cursor.fetchall()
    finally:
        # Always release the connection, including when the query raises.
        conn.close()

    titles = [row[0] for row in rows]
    links = [row[1] for row in rows]
    # Named "vectors" so the module-level ``embeddings`` client is not shadowed.
    vectors = [parse_embedding_string(row[2]) for row in rows]

    return titles, links, vectors

def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Greedily group ready articles whose embeddings are nearly identical.

    Articles are visited in the order returned by
    ``get_titles_links_embeddings``. Each article not yet assigned to a group
    seeds a new group and pulls in every later, still unassigned article whose
    cosine similarity to the seed is strictly greater than ``threshold``.
    Articles are identified by their ``(title, link)`` pair.

    Args:
        eps: Unused; kept so existing callers keep working.
        min_samples: Unused; kept so existing callers keep working.
        threshold: Similarity above which two articles count as the same story.

    Returns:
        A list of groups, each a list of ``(title, link)`` tuples. Articles
        without a match come back as single-element groups. An empty list is
        returned when the database cannot be read.
    """
    try:
        # The helper manages its own connection, so none is opened here.
        titles, links, vectors = get_titles_links_embeddings()
    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []

    articles = list(zip(titles, links, vectors))
    processed_articles = set()
    grouped_similar_articles = []

    for i, (title1, link1, embedding1) in enumerate(articles):
        seed = (title1, link1)
        if seed in processed_articles:
            continue
        processed_articles.add(seed)
        group = [seed]

        # Every article before position i has already been processed (as a
        # seed or as a group member), so only later ones need comparing.
        for title2, link2, embedding2 in articles[i + 1:]:
            candidate = (title2, link2)
            if candidate in processed_articles:
                continue
            if calculate_cosine_similarity(embedding1, embedding2) > threshold:
                processed_articles.add(candidate)
                group.append(candidate)

        grouped_similar_articles.append(group)

    return grouped_similar_articles

def _unique_links(links):
    """Return the distinct links in first-seen order, joined with ', '."""
    unique = list(dict.fromkeys(links))
    if len(unique) == 1:
        # A single source link is passed through untouched.
        return unique[0]
    return ", ".join(str(link) for link in unique)


def _build_merge_prompt(texts, total_tokens, token_limit=2000):
    """Build the user prompt asking the model to merge ``texts`` into one story."""
    count = len(texts)
    parts = [str(text) for text in texts]

    if total_tokens > token_limit:
        # Too long to send whole: concatenate and cut with the project helper.
        combined_text = slice_text_at_2k_tokens(" ".join(parts))
        return rf"Here is text {combined_text}, combined from {count} sources, filter text, and make news content, return as JSON only with single 'content' field"

    if count == 2:
        listed = f"{parts[0]} {parts[1]}"
    else:
        listed = " ".join(parts[:-1]) + " and " + parts[-1]
    return rf"Here are {count} texts {listed}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."


def processing_similar():
    """Merge every group of similar articles into one combined article.

    For each group of two or more articles returned by
    ``find_and_group_similar_articles``:

    1. load each article with ``get_specific_data``;
    2. tag every source article with the group label and mark it not ready;
    3. ask the chat model for a merged text, returned as JSON with a
       ``content`` field;
    4. embed the answer and store the combined article under the first
       article's title.

    A failure while generating or storing one group is printed and the next
    group is processed. Prints a notice when there are no groups at all.
    """
    grouped_similar_articles_result = find_and_group_similar_articles()

    if not grouped_similar_articles_result:
        print("No similar articles found.")
        return

    for group in grouped_similar_articles_result:
        titles = [entry[0] for entry in group if len(entry) >= 2]
        count = len(titles)
        if count < 2:
            # Nothing to merge for a single article.
            continue
        print(count)

        rows = [get_specific_data(title) for title in titles]
        if not all(rows):
            # Skip before touching any flags if an article cannot be loaded.
            print(f"Error: missing article data for group {titles}")
            continue

        # NOTE(review): assumes each row is (title, text, link, ...) -- index 1
        # is used as the text and index 2 as the link; confirm in vectData.
        texts = [row[0][1] for row in rows]
        link = _unique_links(row[0][2] for row in rows)
        tokens = sum(num_tokens_from_string(text) for text in texts)

        similar_d = "C: " + ", ".join(str(title) for title in titles)

        # The sources are flagged *before* the combined article is inserted:
        # the combined article reuses the first title, so flagging afterwards
        # could hit the new row as well.
        for title in titles:
            modify_similar_data(similar_d, title)
            preparing_articles(False, title)

        print(tokens)
        user_message = _build_merge_prompt(texts, tokens)

        # Pre-bound so the error report below works even if the request fails.
        generated_text = None
        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message}
                ])
            generated_text = completion.choices[0].message.content

            response_data = json.loads(generated_text)
            title = titles[0]
            text = response_data["content"]
            # NOTE(review): the embedding is computed from the raw JSON answer,
            # not from the extracted content -- confirm this is intended.
            vector = embeddings.embed_query(generated_text)

            insert_data(title, text, link, vector, similar_d)
            print(f"Inserting combined: {title}")

        except Exception as e:
            # Best-effort boundary: report the failure and move on.
            print(f"Error: {e}")
            print(f"Title: {titles[0]}")
            print(f"Answer: {generated_text}")
# Run the merge pass only when this file is executed as a script.
if __name__ == "__main__":
    processing_similar()

# NOTE(review): the report below sits outside the guard above, so it also runs
# whenever this module is imported -- confirm that is intended.
ready = get_ready_data()
for a in ready or ():
    # Indices 0, 2 and 3 are shown as title, link and status.
    print(f"Title: {a[0]}")
    print(f"Link: {a[2]}")
    print(f"Status: {a[3]}")