added article.py

2024-01-06 08:17:05 +01:00
parent ae1c1902da
commit d4e99c7c5f
8 changed files with 329 additions and 18 deletions
--- a/pyth/pycache/articles.cpython-310.pyc
+++ b/pyth/pycache/articles.cpython-310.pyc
--- a/pyth/pycache/scrapingsingle.cpython-310.pyc
+++ b/pyth/pycache/scrapingsingle.cpython-310.pyc
--- a/pyth/pycache/vectData.cpython-310.pyc
+++ b/pyth/pycache/vectData.cpython-310.pyc
--- a/pyth/articles.py
+++ b/pyth/articles.py
@@ -0,0 +1,231 @@
+import psycopg2
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from dotenv import load_dotenv
+import os
+from openai import OpenAI , APIError 
+from langchain.embeddings import OpenAIEmbeddings
+from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data
+import tiktoken
+from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
+import json
+
+# Load variables from a .env file before anything below reads the environment.
+load_dotenv()
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+# Both clients are constructed without explicit arguments.
+client = OpenAI()
+embeddings = OpenAIEmbeddings()
+
+print("Checking for similar!")
+
+# PostgreSQL connection settings used by the helpers in this module.
+host, port, user, password, dbname = (
+    os.getenv(key)
+    for key in ("DB_HOST", "DB_PORT", "DB_USER", "DB_PASSWORD", "DB_NAME")
+)
+
+def calculate_cosine_similarity(v1, v2):
+    """Return the cosine similarity of two 1-D vectors as a float.
+
+    Args:
+        v1: First vector (NumPy array or any sequence of numbers).
+        v2: Second vector, same length as ``v1``.
+
+    Returns:
+        The cosine of the angle between the vectors, or ``0.0`` when either
+        vector has zero norm (the angle is undefined in that case).
+    """
+    a = np.asarray(v1, dtype=float)
+    b = np.asarray(v2, dtype=float)
+
+    # Guard against zero vectors: dividing by a zero norm would produce NaNs.
+    denominator = np.linalg.norm(a) * np.linalg.norm(b)
+    if denominator == 0:
+        return 0.0
+
+    # Cosine similarity is scale-invariant, so the vectors do not need to be
+    # normalised separately before taking the dot product.
+    return float(np.dot(a, b) / denominator)
+
+def parse_embedding_string(embedding_str):
+    """Convert a stored embedding into a NumPy array of floats.
+
+    Args:
+        embedding_str: Either a bracketed, comma-separated string such as
+            ``"[0.1,0.2]"``, a list/tuple of numbers, or a NumPy array.
+
+    Returns:
+        A NumPy array. An array argument is returned unchanged; an empty
+        vector string (``"[]"``) yields an empty array.
+
+    Raises:
+        ValueError: If the argument has an unsupported type or the string
+            contains a component that is not a number.
+    """
+    if isinstance(embedding_str, np.ndarray):
+        return embedding_str
+    if isinstance(embedding_str, (list, tuple)):
+        return np.array(embedding_str, dtype=float)
+    if isinstance(embedding_str, str):
+        # Strip surrounding whitespace first so the slice really removes the
+        # enclosing brackets.
+        body = embedding_str.strip()[1:-1]
+        # float("") raises, so an empty vector needs its own path.
+        if not body.strip():
+            return np.array([], dtype=float)
+        return np.array([float(num) for num in body.split(',')])
+    raise ValueError("Invalid type for embedding_str. Must be str, list, tuple or np.ndarray.")
+
+
+def get_titles_links_embeddings():
+    """Fetch title, link and embedding for every row flagged ``ready``.
+
+    Returns:
+        A ``(titles, links, embeddings)`` tuple of three parallel lists, one
+        entry per row; each embedding is converted with
+        ``parse_embedding_string``.
+
+    Raises:
+        psycopg2.Error: If connecting or querying fails.
+    """
+    conn = psycopg2.connect(
+        host=host,
+        port=port,
+        user=user,
+        password=password,
+        dbname=dbname
+    )
+    try:
+        # The cursor context manager closes the cursor when the block exits.
+        with conn.cursor() as cursor:
+            cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
+            data = cursor.fetchall()
+    finally:
+        # Close the connection explicitly so repeated calls do not leak
+        # database sessions.
+        conn.close()
+
+    titles = [row[0] for row in data]
+    links = [row[1] for row in data]
+    # Named "vectors" to avoid shadowing the module-level embeddings client.
+    vectors = [parse_embedding_string(row[2]) for row in data]
+
+    return titles, links, vectors
+
+def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
+    """Group ready articles whose embeddings are nearly identical.
+
+    Articles are visited in the order returned by
+    ``get_titles_links_embeddings``. Each article that is not yet part of a
+    group starts a new one and collects every later, still unassigned article
+    whose cosine similarity to it exceeds ``threshold``. Articles without a
+    match end up in single-element groups.
+
+    Args:
+        eps: Unused; kept so existing callers keep working.
+        min_samples: Unused; kept so existing callers keep working.
+        threshold: Similarity above which two articles are grouped together.
+
+    Returns:
+        A list of groups, each a list of ``(title, link)`` tuples, or an
+        empty list when the database access fails.
+    """
+    try:
+        # The helper manages its own connection, so none is opened here.
+        titles, links, vectors = get_titles_links_embeddings()
+    except psycopg2.Error as e:
+        print(f"Error: {e}")
+        return []
+
+    articles = list(zip(titles, links, vectors))
+    processed_articles = set()
+    grouped_similar_articles = []
+
+    for i, (title1, link1, embedding1) in enumerate(articles):
+        if (title1, link1) in processed_articles:
+            continue
+        processed_articles.add((title1, link1))
+        group = [(title1, link1)]
+
+        # Everything before index i has already been assigned to a group, so
+        # only the articles after it can still join this one.
+        for title2, link2, embedding2 in articles[i + 1:]:
+            if (title2, link2) in processed_articles:
+                continue
+            if calculate_cosine_similarity(embedding1, embedding2) > threshold:
+                processed_articles.add((title2, link2))
+                group.append((title2, link2))
+
+        grouped_similar_articles.append(group)
+
+    return grouped_similar_articles
+    
+def _join_unique_links(links):
+    """Return the distinct links joined by ", ", keeping first-seen order."""
+    unique = list(dict.fromkeys(links))
+    if len(unique) == 1:
+        return unique[0]
+    return ", ".join(str(item) for item in unique)
+
+
+def _build_user_message(texts):
+    """Build the prompt asking the model to merge the given article texts."""
+    count = len(texts)
+    tokens = sum(num_tokens_from_string(text) for text in texts)
+    print(tokens)
+
+    if tokens > 2000:
+        # Too long to send whole: join everything and truncate it.
+        combined_text = slice_text_at_2k_tokens(" ".join(texts))
+        return f"Here is text {combined_text}, combined from {count} sources, filter text, and make news content, return as JSON only with single 'content' field"
+
+    if count == 2:
+        listed = " ".join(texts)
+    else:
+        listed = " ".join(texts[:-1]) + " and " + texts[-1]
+    return f"Here are {count} texts {listed}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
+
+
+def processing_similar():
+    """Merge every group of similar articles into one combined article.
+
+    For each group with at least two members the source articles are tagged
+    with a shared ``similar_d`` marker and flagged as not ready, their texts
+    are sent to the chat model, and the returned ``content`` is inserted as a
+    new article under the first member's title. A failure while generating or
+    storing one group is printed and the loop moves on to the next group.
+    """
+    grouped_similar_articles_result = find_and_group_similar_articles()
+
+    if not grouped_similar_articles_result:
+        print("No similar articles found.")
+        return
+
+    for group in grouped_similar_articles_result:
+        titles = [article_tuple[0] for article_tuple in group]
+        if len(titles) < 2:
+            continue
+        print(len(titles))
+
+        rows = [get_specific_data(title) for title in titles]
+        # An article can no longer be looked up: skip the group instead of
+        # failing on rows[...][0].
+        if not all(rows):
+            print(f"Skipping group, article data missing for: {titles}")
+            continue
+
+        texts = [row[0][1] for row in rows]
+        link = _join_unique_links([row[0][2] for row in rows])
+        similar_d = "C: " + ", ".join(str(title) for title in titles)
+
+        # NOTE(review): the sources are flagged not-ready before the merge
+        # succeeds, so a failed completion leaves them hidden - confirm this
+        # is the intended behaviour.
+        for title in titles:
+            modify_similar_data(similar_d, title)
+            preparing_articles(False, title)
+
+        user_message = _build_user_message(texts)
+
+        # Bound up front so the error handler can always print it.
+        generated_text = None
+        try:
+            completion = client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
+                    {"role": "user", "content": user_message}
+                ])
+            generated_text = completion.choices[0].message.content
+
+            response_data = json.loads(generated_text)
+            title = titles[0]
+            text = response_data["content"]
+            vector = embeddings.embed_query(generated_text)
+
+            insert_data(title, text, link, vector, similar_d)
+            print(f"Inserting combined: {title}")
+
+        except Exception as e:
+            # Deliberately broad: one failing group must not stop the batch.
+            print(f"Error: {e}")
+            print(f"Title: {titles[0]}")
+            print(f"Answer: {generated_text}")
+            continue
+if __name__ == "__main__":
+    processing_similar()
+
+# Everything below sits outside the guard above, so it also runs on import.
+ready = get_ready_data()
+for row in ready or ():
+    print(f"Title: {row[0]}")
+    print(f"Link: {row[2]}")
+    print(f"Status: {row[3]}")
--- a/pyth/scrapingsingle.py
+++ b/pyth/scrapingsingle.py
@@ -4,7 +4,7 @@ from urllib.parse import urljoin
 from openai import OpenAI , APIError 
 import os
 from langchain.embeddings import OpenAIEmbeddings
-from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data)
+from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data)
 import json
 from dotenv import load_dotenv
 import tiktoken
@@ -48,6 +48,19 @@ def replace_with_spaces(text):
    cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text)
    return cleaned_text

+
+def fix_links(links_set):
+    """Return a new set in which every "www." has been removed from each link.
+
+    Links that do not contain "www." are carried over unchanged.
+    """
+    # str.replace leaves a string without the substring untouched, so no
+    # separate membership test is needed.
+    return {link.replace("www.", "") for link in links_set}
+
 total_links = set()
 collected_news = set()

@@ -78,13 +91,13 @@ for dlink in dlinks:
        total_links.update(temp_links)

 final_links = {item for item in total_links if item}
-i = 0 

 db_links = set(get_all_links())
 new_links = final_links - db_links
 final_links = new_links
+final_links = set(final_links)

-
+final_links = fix_links(final_links)

 if __name__ == '__main__':

@@ -142,6 +155,7 @@ if __name__ == '__main__':
        print(f"Error in completion: {e}")
        continue

+
 def comb_similar():

    print("Checking similar")
@@ -185,12 +199,17 @@ def comb_similar():
                        combined_text = f"{text1}{text2}{text3}"
                        combined_text = slice_text_at_2k_tokens(combined_text)
                        user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field"
-                        link = f"{link1} {link2} {link3}"
+                        if link1 != link2 and link1 != link3 and link2 != link3:
+                            link = f"{link1} {link2} {link3}"
+                        else:
+                            link = link1

                    else:
                        user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
-                        link = f"{link1} {link2} {link3}"
-
+                        if link1 != link2 and link1 != link3 and link2 != link3:
+                            link = f"{link1} {link2} {link3}"
+                        else:
+                            link = link1
                else:
                    ftcheck = num_tokens_from_string(f_text)
                    stcheck = num_tokens_from_string(s_text)
@@ -198,12 +217,17 @@ def comb_similar():
                    if fscomb <2000:
                        combined_text = f"{f_text}{s_text}"
                        user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field"
-                        link = f"{link_f} {link_s}"
+                        if link_f != link_s:
+                            link = f"{link_f} {link_s}"
+                        else:
+                            link = link_f

                    else:
                        user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
-                        link = f"{link_f} {link_s}"
-
+                        if link_f != link_s:
+                            link = f"{link_f} {link_s}"
+                        else:
+                            link = link_f
            try:
                completion = client.chat.completions.create(
                    model="gpt-3.5-turbo",
@@ -213,7 +237,6 @@ def comb_similar():
                    ]
                )
                generated_text = completion.choices[0].message.content
-                generated_text = generated_text

                if similar_article:
                    if f_title == s_title:
@@ -222,6 +245,7 @@ def comb_similar():
                        similar_article.remove(sa)
                        print("Modified")
                    else:
+                        print(f"First: {f_title}")
                        print(f"Second: {s_title}")
                        modify_similar_data(first_t,"SOURCE")
                        modify_similar_data(second_t,"SOURCE")
@@ -243,5 +267,3 @@ def comb_similar():
            except Exception as e:
                print(f"Error in completion: {e}")
                continue
-
-comb_similar()
--- a/pyth/templates/index.html
+++ b/pyth/templates/index.html
@@ -18,6 +18,5 @@
            <a href="/article/two">Second</a>
        </article>
    </div>
-    
 </body>
 </html>
--- a/pyth/vectData.py
+++ b/pyth/vectData.py
@@ -83,6 +83,7 @@ def get_similar():
    return similar_data


+
 def insert_data(title, text, link, embedding, similar_d):
    conn = psycopg2.connect(
        host=host,
@@ -97,9 +98,9 @@ def insert_data(title, text, link, embedding, similar_d):
    cursor = conn.cursor()

    cursor.execute('''
-        INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time)
-        VALUES (%s, %s, %s, %s, %s ,%s);
-    ''', (title, text, link, embedding , similar_d, c_time))
+        INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
+        VALUES (%s, %s, %s, %s, %s ,%s ,%s);
+    ''', (title, text, link, embedding , similar_d, c_time, True))

    conn.commit()

@@ -121,6 +122,39 @@ def get_data():
    cursor.close()
    return data

+def get_ready_data():
+    """Return the ``(title, text, link, ready)`` rows whose ``ready`` flag is true.
+
+    Returns:
+        The list of row tuples produced by ``cursor.fetchall()``.
+
+    Raises:
+        psycopg2.Error: If connecting or querying fails.
+    """
+    conn = psycopg2.connect(
+        host=host,
+        port=port,
+        user=user,
+        password=password,
+        dbname=dbname
+    )
+    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
+    try:
+        with conn.cursor() as cursor:
+            # Pass a real boolean rather than the string 'True'.
+            cursor.execute(query, (True,))
+            return cursor.fetchall()
+    finally:
+        # Close the connection so each call does not leak a session.
+        conn.close()
+
+def get_source_data():
+    """Return the ``(title, text, link, ready)`` rows whose ``ready`` flag is false.
+
+    Returns:
+        The list of row tuples produced by ``cursor.fetchall()``.
+
+    Raises:
+        psycopg2.Error: If connecting or querying fails.
+    """
+    conn = psycopg2.connect(
+        host=host,
+        port=port,
+        user=user,
+        password=password,
+        dbname=dbname
+    )
+    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
+    try:
+        with conn.cursor() as cursor:
+            # Pass a real boolean rather than the string 'False'.
+            cursor.execute(query, (False,))
+            return cursor.fetchall()
+    finally:
+        # Close the connection so each call does not leak a session.
+        conn.close()
+
+
 def modify_similar_data(new_value ,title):

    conn = psycopg2.connect(
@@ -138,6 +172,24 @@ def modify_similar_data(new_value ,title):

    conn.commit()

+
+def preparing_articles(new_value, title):
+    """Set the ``ready`` flag to ``new_value`` on every row with the given title.
+
+    Args:
+        new_value: Value written to the ``ready`` column.
+        title: Title identifying the row(s) to update.
+
+    Raises:
+        psycopg2.Error: If connecting or updating fails.
+    """
+    conn = psycopg2.connect(
+        host=host,
+        port=port,
+        user=user,
+        password=password,
+        dbname=dbname
+    )
+    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
+    try:
+        with conn.cursor() as cursor:
+            cursor.execute(query, (new_value, title))
+        conn.commit()
+    finally:
+        # Close the connection so each call does not leak a session.
+        conn.close()
+
 def get_specific_data(title):
    conn = psycopg2.connect(
        host=host,
@@ -244,7 +296,9 @@ def create_db(conn):
            link VARCHAR,
            embedding vector(1536),
            similar_d VARCHAR,
-            time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            ready BOOLEAN
+
        );
    ''')

--- a/pyth/web-server.py
+++ b/pyth/web-server.py
@@ -1,5 +1,5 @@
 from flask import Flask , render_template , jsonify
-from vectData import get_data
+from vectData import get_ready_data
 from flask_cors import CORS


@@ -21,4 +21,9 @@ def articleone():
 def articletwo():
    return render_template("two.html")

+@app.route('/data/get/news', methods=['GET'])
+def takenews():
+    """Serve the rows returned by get_ready_data() as a JSON response."""
+    return jsonify(get_ready_data())
+
 app.run(debug=True)