Combine similar articles

2024-01-02 15:00:07 +01:00
parent fff1c94a3d
commit ae1c1902da
15 changed files with 726 additions and 39 deletions
--- a/pyth/scrapingsingle.py
+++ b/pyth/scrapingsingle.py
@@ -1,15 +1,20 @@
 from bs4 import BeautifulSoup
 import requests
 from urllib.parse import urljoin
-from openai import OpenAI
+from openai import OpenAI , APIError 
 import os
 from langchain.embeddings import OpenAIEmbeddings
-from langchain.vectorstores.pgvector import PGVector
-from vectData import insert_data ,is_similar_data 
+from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data)
 import json
+from dotenv import load_dotenv
+import tiktoken


-os.environ["OPENAI_API_KEY"] = "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"
+# Load variables from a .env file into the process environment (python-dotenv).
+load_dotenv()
+# NOTE(review): cleansing() is imported from vectData and runs every time this
+# module is imported; what it cleans is not visible here - confirm it is safe
+# to run unconditionally.
+cleansing()
+
+# NOTE(review): this constant is not referenced again in the visible code;
+# presumably OpenAI() and OpenAIEmbeddings() read the key from the environment
+# themselves - confirm before relying on it.
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
 client = OpenAI()
 embeddings = OpenAIEmbeddings()

@@ -17,9 +22,36 @@ dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
 headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}


+
+def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
+    """Return the number of tokens `string` encodes to for `model`.
+
+    The tokenizer is looked up with ``tiktoken.encoding_for_model(model)``.
+    """
+    token_ids = tiktoken.encoding_for_model(model).encode(string)
+    return len(token_ids)
+
+def slice_text_at_2k_tokens(text, max_tokens=2000, model="gpt-3.5-turbo"):
+    """Truncate `text` to at most `max_tokens` tokens of `model`'s tokenizer.
+
+    Args:
+        text: The string to truncate.
+        max_tokens: Maximum number of tokens to keep (2000 by default).
+        model: Model name handed to ``tiktoken.encoding_for_model``.
+
+    Returns:
+        `text` itself when it already fits, otherwise the decoded first
+        `max_tokens` tokens. The result is always a ``str``.
+    """
+    encoding = tiktoken.encoding_for_model(model)
+    tokens = encoding.encode(text)
+
+    if len(tokens) <= max_tokens:
+        # Return the string itself, not a one-element list: callers format the
+        # result straight into a prompt, where a list would show up as its
+        # repr ("['...']").
+        return text
+
+    return encoding.decode(tokens[:max_tokens])
+
+
+def replace_with_spaces(text):
+    """Return `text` with every non-whitelisted character turned into a space.
+
+    The whitelist is the set of characters spelled out in `keep`: ASCII
+    letters, the listed accented Latin letters, digits and the space itself.
+    The output always has the same length as the input.
+    """
+    keep = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐđŠšŽž0123456789 "
+    pieces = []
+    for ch in text:
+        pieces.append(ch if ch in keep else ' ')
+    return ''.join(pieces)
+
 total_links = set()
 collected_news = set()

+
 def get_article_links(url, already_checked):
    response = requests.get(url,headers)
    if response.status_code == 200:
@@ -36,6 +68,8 @@ def get_article_links(url, already_checked):
                    already_checked.add(link_value)
        return link_store

+
+
 already_checked = set()

 for dlink in dlinks:
@@ -44,8 +78,17 @@ for dlink in dlinks:
        total_links.update(temp_links)

 final_links = {item for item in total_links if item}
+i = 0 

-for link in final_links:
+# Drop every link that get_all_links() already returns, so the scraping loop
+# below only requests links that are new.
+db_links = set(get_all_links())
+new_links = final_links - db_links
+final_links = new_links
+
+
+
+if __name__ == '__main__':
+
+ for link in final_links:
    response = requests.get(link,headers)
    soup = BeautifulSoup(response.text, 'html.parser')

@@ -54,6 +97,16 @@ for link in final_links:

    texts = soup.find_all(['p'])
    text_text = ' '.join([text.get_text(strip=True) for text in texts])
+
+    text_text = text_text
+    title_text = title_text
+    
+    title_text = replace_with_spaces(title_text)
+
+    
+    print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")
+    text_text = slice_text_at_2k_tokens(text_text)
+    text_text = replace_with_spaces(str(text_text))
    
    try:
        completion = client.chat.completions.create(
@@ -65,23 +118,130 @@ for link in final_links:
        )
        generated_text = completion.choices[0].message.content

+        generated_text = generated_text
+
        response_data = json.loads(generated_text)
        
        title = response_data["title"]
        text = response_data["content"]

-        print("*********************************")
-        print(f"Title: {title}")
-        print("---------------------------------")
-        print(f"Content : {text}")
-        print("*********************************")
+        #print("*********************************")
+        #print(f"Title: {title}")
+        #print("---------------------------------")
+        #print(f"Content : {text}")
+        #print("*********************************")


        vector = embeddings.embed_query(generated_text)
-
-        if not is_similar_data(title, text, link, vector, threshold=0.9):
-         insert_data(title, text, link, vector)
        
+        if not is_similar_data(title, text, link, vector, threshold=0.98):
+         similar_d = "NO"
+         insert_data(title, text, link, vector,similar_d)
+
    except Exception as e:
        print(f"Error in completion: {e}")
        continue
+
+def comb_similar():
+    """Merge pairs of similar articles into one combined article each.
+
+    For every pair returned by ``get_similar()`` the two stored rows are
+    loaded with ``get_specific_data``, a merge prompt is sent to the chat
+    model, the source rows are passed to ``modify_similar_data`` with
+    ``"SOURCE"``, and the model's ``content`` is stored with ``insert_data``
+    when ``is_similar_data`` returns a falsy value.
+
+    Errors raised while calling the model or storing the result are printed
+    and the function moves on to the next pair. Returns ``None``.
+    """
+    print("Checking similar")
+    similar_article = get_similar()
+
+    # title -> list of (text, link) gathered from every pair seen so far.
+    grouped_data = {}
+
+    # The list is only read here. Removing processed pairs from it while
+    # iterating would make the loop skip the element after each removal.
+    for sa in similar_article:
+        first_t = get_specific_data(sa[0])
+        second_t = get_specific_data(sa[1])
+        # The first row of each result is indexed as (title, text, link).
+        f_title, f_text, link_f = first_t[0][0], first_t[0][1], first_t[0][2]
+        s_title, s_text, link_s = second_t[0][0], second_t[0][1], second_t[0][2]
+
+        grouped_data.setdefault(f_title, []).append((f_text, link_f))
+        grouped_data.setdefault(s_title, []).append((s_text, link_s))
+
+        # NOTE(review): every group is revisited for every pair and each pass
+        # overwrites `user_message`/`link`, so only the last dict entry decides
+        # the prompt. Kept as-is; confirm whether only the current pair's
+        # groups were meant to be inspected.
+        for tuples in grouped_data.values():
+            if len(tuples) == 3:
+                (text1, link1), (text2, link2), (text3, link3) = tuples
+                link = f"{link1} {link2} {link3}"
+
+                # Compare the *total* token count with the budget; comparing
+                # the tuple of counts with an int raises TypeError.
+                total_tokens = sum(
+                    num_tokens_from_string(part)
+                    for part in (text1, text2, text3)
+                )
+                if total_tokens < 2000:
+                    combined_text = slice_text_at_2k_tokens(f"{text1}{text2}{text3}")
+                    # Tolerate a helper that wraps short text in a list.
+                    if isinstance(combined_text, list):
+                        combined_text = combined_text[0]
+                    user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field"
+                else:
+                    # NOTE(review): nothing is truncated on this over-budget
+                    # path - confirm the prompt cannot exceed the model limit.
+                    user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
+            else:
+                link = f"{link_f} {link_s}"
+                pair_tokens = num_tokens_from_string(f_text) + num_tokens_from_string(s_text)
+                if pair_tokens < 2000:
+                    combined_text = f"{f_text}{s_text}"
+                    user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field"
+                else:
+                    user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
+
+        try:
+            completion = client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
+                    {"role": "user", "content": user_message}
+                ]
+            )
+            generated_text = completion.choices[0].message.content
+
+            if f_title == s_title:
+                print(f_title)
+                modify_similar_data(first_t, "SOURCE")
+            else:
+                print(f"Second: {s_title}")
+                modify_similar_data(first_t, "SOURCE")
+                modify_similar_data(second_t, "SOURCE")
+            print("Modified")
+
+            # The model is asked for JSON with a 'content' field; a reply that
+            # is not valid JSON or lacks the key is reported by the handler.
+            response_data = json.loads(generated_text)
+            title = f_title
+            text = response_data["content"]
+
+            vector = embeddings.embed_query(generated_text)
+
+            if not is_similar_data(title, text, link, vector, threshold=0.98):
+                insert_data(title, text, link, vector, "NO")
+
+        except Exception as e:
+            # Best effort per pair: report the failure and keep going.
+            print(f"Error in completion: {e}")
+
+# NOTE(review): this call sits outside the `if __name__ == '__main__':` guard,
+# so it also runs whenever the module is imported - confirm that is intended.
+comb_similar()