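organizing code: move embedding/DB helpers into vectData.py, reuse the module-level connection, drop debug prints, add a 4-article merge case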

2024-01-07 03:41:32 +01:00
parent 96a2d88895
commit b7a0e5478c
5 changed files with 122 additions and 235 deletions
--- a/pyth/pycache/scrapingsingle.cpython-310.pyc
+++ b/pyth/pycache/scrapingsingle.cpython-310.pyc
--- a/pyth/pycache/vectData.cpython-310.pyc
+++ b/pyth/pycache/vectData.cpython-310.pyc
--- a/pyth/articles.py
+++ b/pyth/articles.py
@@ -1,12 +1,10 @@
 import psycopg2
 import numpy as np
-from sklearn.metrics.pairwise import cosine_similarity
 from dotenv import load_dotenv
 import os
-from openai import OpenAI , APIError 
+from openai import OpenAI
 from langchain.embeddings import OpenAIEmbeddings
-from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data
-import tiktoken
+from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
 from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
 import json

@@ -18,80 +16,30 @@ embeddings = OpenAIEmbeddings()

 print(f"Checking for similar!")

-host = os.getenv("DB_HOST")
-port = os.getenv("DB_PORT")
-user = os.getenv("DB_USER")
-password = os.getenv("DB_PASSWORD")
-dbname = os.getenv("DB_NAME")
-
-def calculate_cosine_similarity(v1, v2):
-    v1_normalized = v1 / np.linalg.norm(v1)
-    v2_normalized = v2 / np.linalg.norm(v2)
-
-    similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
-    return similarity
-
-def parse_embedding_string(embedding_str):
-    if isinstance(embedding_str, str):
-        numbers = [float(num) for num in embedding_str[1:-1].split(',')]
-        return np.array(numbers)
-    elif isinstance(embedding_str, np.ndarray):
-        return embedding_str
-    else:
-        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")
-
-
-def get_titles_links_embeddings():
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
-    cursor = conn.cursor()
-    cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
-    data = cursor.fetchall()
-    cursor.close()
-
-    titles = [row[0] for row in data]
-    links = [row[1] for row in data]
-    embeddings = [parse_embedding_string(row[2]) for row in data]
-
-    return titles, links, embeddings

 def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    try:
-        conn = psycopg2.connect(
-            host=host,
-            port=port,
-            user=user,
-            password=password,
-            dbname=dbname
-        )
+        titles, links, embeddings = get_titles_links_embeddings()

-        with conn, conn.cursor() as cursor:
-            titles, links, embeddings = get_titles_links_embeddings()
+        processed_articles = set()
+        grouped_similar_articles = []

-            processed_articles = set()
-            grouped_similar_articles = []
+        for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)):
+            if (title1, link1) not in processed_articles:
+                processed_articles.add((title1, link1))
+                group = [(title1, link1)]

-            for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)):
-                if (title1, link1) not in processed_articles:
-                    processed_articles.add((title1, link1))
-                    group = [(title1, link1)]
+                for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)):
+                    if i != j and (title2, link2) not in processed_articles:
+                        similarity = calculate_cosine_similarity(embedding1, embedding2)

-                    for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)):
-                        if i != j and (title2, link2) not in processed_articles:
-                            similarity = calculate_cosine_similarity(embedding1, embedding2)
+                        if similarity > threshold:
+                            processed_articles.add((title2, link2))
+                            group.append((title2, link2))

-                            if similarity > threshold:
-                                processed_articles.add((title2, link2))
-                                group.append((title2, link2))
+                grouped_similar_articles.append(group)

-                    grouped_similar_articles.append(group)
-
-            return grouped_similar_articles
+        return grouped_similar_articles

    except psycopg2.Error as e:
        print(f"Error: {e}")
@@ -101,7 +49,6 @@ def processing_similar():
        grouped_similar_articles_result = find_and_group_similar_articles()

        if grouped_similar_articles_result:
-
            for group in grouped_similar_articles_result:
                articles = []

@@ -112,8 +59,8 @@ def processing_similar():
                            article = [title, link]
                            articles.append(article)
                    l = len(articles)
+
                    if l == 2:
-                        print("2")
                        a_one = articles[0][0]
                        a_two = articles[1][0]

@@ -141,7 +88,6 @@ def processing_similar():
                        modify_similar_data(similar_d, a_two)
                        preparing_articles(False, a_two)

-                        print(tokens)
                        if tokens > 2000:
                            combined_text = f"{text1} {text2}"
                            combined_text = slice_text_at_2k_tokens(combined_text)
@@ -150,7 +96,6 @@ def processing_similar():
                            user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."

                    if l == 3:
-                        print("3")
                        a_one = articles[0][0]
                        a_two = articles[1][0]
                        a_three = articles[2][0]
@@ -190,13 +135,82 @@ def processing_similar():
                        modify_similar_data(similar_d, a_three)
                        preparing_articles(False, a_three)

-                        print(tokens)
                        if tokens > 2000:
                            combined_text = f"{text1} {text2} {text3}"
                            combined_text = slice_text_at_2k_tokens(combined_text)
                            user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field"
                        else:
                            user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
+                    if l == 4:
+                        print("4")
+                        a_one = articles[0][0]
+                        a_two = articles[1][0]
+                        a_three = articles[2][0]
+                        a_four = articles[3][0]
+
+                        get_one = get_specific_data(a_one)
+                        get_two = get_specific_data(a_two)
+                        get_three = get_specific_data(a_three)
+                        get_four = get_specific_data(a_four)
+
+                        text1 = get_one[0][1]
+                        text2 = get_two[0][1]
+                        text3 = get_three[0][1]
+                        text4 = get_four[0][1]
+                        link1 = get_one[0][2]
+                        link2 = get_two[0][2]
+                        link3 = get_three[0][2]
+                        link4 = get_four[0][2]
+
+                        if link1 != link2:
+                            if link2 != link3:
+                                if link3 != link4:
+                                    link = f"{link1}, {link2}, {link3}, {link4}"
+                                else:
+                                    link = f"{link1}, {link2}, {link3}"
+                            else:
+                                if link3 != link4:
+                                    link = f"{link1}, {link2}, {link4}"
+                                else:
+                                    link = f"{link1}, {link2}"
+                        else:
+                            if link2 != link3:
+                                if link3 != link4:
+                                    link = f"{link1}, {link3}, {link4}"
+                                else:
+                                    link = f"{link1}, {link3}"
+                            else:
+                                if link3 != link4:
+                                    link = f"{link1}, {link4}"
+                                else:
+                                    link = link1
+
+                        ftoks = num_tokens_from_string(text1)
+                        stoks = num_tokens_from_string(text2)
+                        ttoks = num_tokens_from_string(text3)
+                        frtoks = num_tokens_from_string(text4)
+
+                        tokens = ftoks + stoks + ttoks + frtoks
+
+                        similar_d = f"C: {a_one}, {a_two}, {a_three}, {a_four}"
+                        modify_similar_data(similar_d, a_one)
+                        preparing_articles(False, a_one)
+
+                        modify_similar_data(similar_d, a_two)
+                        preparing_articles(False, a_two)
+
+                        modify_similar_data(similar_d, a_three)
+                        preparing_articles(False, a_three)
+
+                        modify_similar_data(similar_d, a_four)
+                        preparing_articles(False, a_four)
+
+                        if tokens > 2000:
+                            combined_text = f"{text1} {text2} {text3} {text4}"
+                            combined_text = slice_text_at_2k_tokens(combined_text)
+                            user_message = rf"Here is text {combined_text}, combined from 4 sources, filter text, and make news content, return as JSON only with a single 'content' field"
+                        else:
+                            user_message = rf"Here are 4 texts {text1} {text2} {text3} and {text4}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field."
                    try:
                        completion = client.chat.completions.create(
                            model="gpt-3.5-turbo",
@@ -216,16 +230,11 @@ def processing_similar():

                    except Exception as e:
                        print(f"Error: {e}")
-                        print(f"Title: {a_one}")
-                        print(f"Answer: {generated_text}")
+                        print(a_one)
                        continue
+            else:
+                print("Done!.")
        else:
            print("No similar articles found.")
 if __name__=="__main__":
    processing_similar()
-ready = get_ready_data()
-if ready:
-    for a in ready:
-        print(f"Title: {a[0]}")
-        print(f"Link: {a[2]}")
-        print(f"Status: {a[3]}")
--- a/pyth/scrapingsingle.py
+++ b/pyth/scrapingsingle.py
@@ -1,10 +1,10 @@
 from bs4 import BeautifulSoup
 import requests
 from urllib.parse import urljoin
-from openai import OpenAI , APIError 
+from openai import OpenAI 
 import os
 from langchain.embeddings import OpenAIEmbeddings
-from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data)
+from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
 import json
 from dotenv import load_dotenv
 import tiktoken
@@ -39,7 +39,7 @@ def slice_text_at_2k_tokens(text):

    sliced_tokens = tokens[:max_tokens]
    sliced_text = encoding.decode(sliced_tokens)
-    
+
    return sliced_text


@@ -82,7 +82,6 @@ def get_article_links(url, already_checked):
        return link_store


-
 already_checked = set()

 for dlink in dlinks:
@@ -116,8 +115,6 @@ if __name__ == '__main__':
    
    title_text = replace_with_spaces(title_text)

-    
-    print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")
    text_text = slice_text_at_2k_tokens(text_text)
    text_text = replace_with_spaces(str(text_text))
    
@@ -138,13 +135,6 @@ if __name__ == '__main__':
        title = response_data["title"]
        text = response_data["content"]

-        #print("*********************************")
-        #print(f"Title: {title}")
-        #print("---------------------------------")
-        #print(f"Content : {text}")
-        #print("*********************************")
-
-
        vector = embeddings.embed_query(generated_text)
        
        if not is_similar_data(title, text, link, vector, threshold=0.98):
--- a/pyth/vectData.py
+++ b/pyth/vectData.py
@@ -7,7 +7,6 @@ import os
 from dotenv import load_dotenv
 from datetime import datetime ,timedelta

-
 load_dotenv()

 host = os.getenv("DB_HOST")
@@ -27,20 +26,20 @@ conn = psycopg2.connect(
 def calculate_cosine_similarity(v1, v2):
    v1_normalized = v1 / np.linalg.norm(v1)
    v2_normalized = v2 / np.linalg.norm(v2)
-
    similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
    return similarity

-def is_similar_data(title, text, link, embedding, threshold=0.98):
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
-    cursor = conn.cursor()
+def parse_embedding_string(embedding_str):
+    if isinstance(embedding_str, str):
+        numbers = [float(num) for num in embedding_str[1:-1].split(',')]
+        return np.array(numbers)
+    elif isinstance(embedding_str, np.ndarray):
+        return embedding_str
+    else:
+        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")

+def is_similar_data(title, text, link, embedding, threshold=0.98):
+    cursor = conn.cursor()
    cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
    existing_embeddings = cursor.fetchall()

@@ -54,12 +53,12 @@ def is_similar_data(title, text, link, embedding, threshold=0.98):
                similar_d = existing_title
                insert_data(title,text,link,embedding,similar_d)
                print(f"Similar data found: \n #{title} \n #{existing_title}")
-                print(f"Inserting: #{title} \n")
+                print(f"Inserting: #{title}")
                similar_d = "NO"
                cursor.close()
                return True
            else:
-                print(f"Same source of same article!")
+                print(f"Same article of same source!")
                cursor.close()
                return True

@@ -68,13 +67,6 @@ def is_similar_data(title, text, link, embedding, threshold=0.98):
    return False

 def get_similar():
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
    cursor = conn.cursor()
    query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
    cursor.execute(query)
@@ -82,73 +74,49 @@ def get_similar():
    cursor.close()
    return similar_data

+def get_titles_links_embeddings():
+    cursor = conn.cursor()
+    cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
+    data = cursor.fetchall()
+    cursor.close()
+
+    titles = [row[0] for row in data]
+    links = [row[1] for row in data]
+    embeddings = [parse_embedding_string(row[2]) for row in data]
+
+    return titles, links, embeddings


 def insert_data(title, text, link, embedding, similar_d):
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
    c_time = datetime.now()
-
-
    cursor = conn.cursor()
-
    cursor.execute('''
        INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
        VALUES (%s, %s, %s, %s, %s ,%s ,%s);
    ''', (title, text, link, embedding , similar_d, c_time, True))
-
    conn.commit()
-
    cursor.close()

 def get_data():
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
+
    cursor = conn.cursor()
    query = '''SELECT title,text,link FROM vectorsvevijesti;'''
-
    cursor.execute(query)
    data = cursor.fetchall()
    cursor.close()
    return data

 def get_ready_data():
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
    cursor = conn.cursor()
    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
-
    cursor.execute(query, ('True',))
    data = cursor.fetchall()
    cursor.close()
    return data

 def get_source_data():
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
    cursor = conn.cursor()
    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
-
    cursor.execute(query, ('False',))
    data = cursor.fetchall()
    cursor.close()
@@ -156,138 +124,60 @@ def get_source_data():


 def modify_similar_data(new_value ,title):
-
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    ) 
    cursor = conn.cursor()
-
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
-
    cursor.execute(query, (new_value, title))
-
    conn.commit()


 def preparing_articles(new_value ,title):
-
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    ) 
    cursor = conn.cursor()
-
    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
-
    cursor.execute(query, (new_value, title))
-
    conn.commit()

 def get_specific_data(title):
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
    cursor = conn.cursor()
-    query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s'''
+    query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s'''
    cursor.execute(query, (title,))
-    
    specific_post = cursor.fetchall()
    cursor.close()
    return specific_post

+
 def get_all_links():
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
    cursor = conn.cursor()
    query = '''SELECT link FROM vectorsvevijesti'''
    cursor.execute(query)
-
    db_links = {link[0] for link in cursor.fetchall()}
    cursor.close()
    return db_links

 def delete_specific(title):
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
-
    cursor = conn.cursor()
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
-
    cursor.execute(query,(title,))
    cursor.close()

 def cleansing():
-
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
-
    day_long = datetime.now() - timedelta(days=1)
-
    cursor = conn.cursor()
-
    query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''
    cursor.execute(query,(day_long,))
-
    conn.commit()
    cursor.close()

 def drop_table():
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
-
    cursor = conn.cursor()
-
    query = '''DROP TABLE IF EXISTS vectorsvevijesti;'''
    cursor.execute(query)
-
    conn.commit()
    cursor.close()

-def create_db(conn):
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
+def create_db():
    cursor = conn.cursor()
-
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
-
    register_vector(conn)
-
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS vectorsvevijesti (
            id bigserial PRIMARY KEY,
@@ -298,10 +188,8 @@ def create_db(conn):
            similar_d VARCHAR,
            time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            ready BOOLEAN
-
        );
    ''')
-
    conn.commit()
    cursor.close()
-create_db(conn)
+create_db()