Fixed response/JSON

2024-01-08 00:28:20 +01:00
parent b7a0e5478c
commit 54a41046ce
4 changed files with 23 additions and 23 deletions
--- a/pyth/pycache/scrapingsingle.cpython-310.pyc
+++ b/pyth/pycache/scrapingsingle.cpython-310.pyc
--- a/pyth/pycache/vectData.cpython-310.pyc
+++ b/pyth/pycache/vectData.cpython-310.pyc
--- a/pyth/articles.py
+++ b/pyth/articles.py
@@ -7,6 +7,7 @@ from langchain.embeddings import OpenAIEmbeddings
 from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
 from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
 import json
+from json_repair import repair_json

 load_dotenv()

@@ -16,7 +17,6 @@ embeddings = OpenAIEmbeddings()

 print(f"Checking for similar!")

-
 def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    try:
        titles, links, embeddings = get_titles_links_embeddings()
@@ -142,7 +142,6 @@ def processing_similar():
                        else:
                            user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
                    if l == 4:
-                        print("4")
                        a_one = articles[0][0]
                        a_two = articles[1][0]
                        a_three = articles[2][0]
@@ -220,6 +219,8 @@ def processing_similar():
                            ])
                        generated_text = completion.choices[0].message.content

+                        generated_text = repair_json(generated_text)
+
                        response_data = json.loads(generated_text)
                        title = a_one
                        text = response_data["content"]
--- a/pyth/scrapingsingle.py
+++ b/pyth/scrapingsingle.py
@@ -8,7 +8,7 @@ from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
 import json
 from dotenv import load_dotenv
 import tiktoken
-
+from json_repair import repair_json

 load_dotenv()
 cleansing()
@@ -21,50 +21,50 @@ embeddings = OpenAIEmbeddings()
 dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
 headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

-
-
 def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(string))

 def slice_text_at_2k_tokens(text):
    encoding_name = "gpt-3.5-turbo"
-    max_tokens = 2000
-
+    max_tokens = 1950
    encoding = tiktoken.encoding_for_model(encoding_name)
    tokens = encoding.encode(text)
-
    if len(tokens) <= max_tokens:
        return [text] 
-
    sliced_tokens = tokens[:max_tokens]
    sliced_text = encoding.decode(sliced_tokens)
-
    return sliced_text

+def slice_title_if_needed(text):
+    encoding_name = "gpt-3.5-turbo"
+    max_tokens = 100
+    encoding = tiktoken.encoding_for_model(encoding_name)
+    tokens = encoding.encode(text)
+    if len(tokens) <= max_tokens:
+        return [text] 
+    sliced_tokens = tokens[:max_tokens]
+    sliced_text = encoding.decode(sliced_tokens)
+    return sliced_text

 def replace_with_spaces(text):
    allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐđŠšŽž0123456789 "
    cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text)
    return cleaned_text

-
 def fix_links(links_set):
    modified_links = set()
-
    for link in links_set:
        if "www" in link:
            modified_link = link.replace("www.", "")
            modified_links.add(modified_link)
        else:
            modified_links.add(link)
-
    return modified_links

 total_links = set()
 collected_news = set()

-
 def get_article_links(url, already_checked):
    response = requests.get(url,headers)
    if response.status_code == 200:
@@ -81,25 +81,22 @@ def get_article_links(url, already_checked):
                    already_checked.add(link_value)
        return link_store

-
 already_checked = set()

 for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if temp_links:
        total_links.update(temp_links)
-
 final_links = {item for item in total_links if item}

 db_links = set(get_all_links())
 new_links = final_links - db_links
 final_links = new_links
 final_links = set(final_links)
-
 final_links = fix_links(final_links)

 if __name__ == '__main__':
-
+ 
 for link in final_links:
    response = requests.get(link,headers)
    soup = BeautifulSoup(response.text, 'html.parser')
@@ -117,24 +114,26 @@ if __name__ == '__main__':

    text_text = slice_text_at_2k_tokens(text_text)
    text_text = replace_with_spaces(str(text_text))
-    
+
+    ttk = num_tokens_from_string(text_text)
+
+    if ttk > 1900:
+        title_text = slice_title_if_needed(title_text)
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
-                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
+                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."}
            ]
        )
        generated_text = completion.choices[0].message.content

-        generated_text = generated_text
+        generated_text = repair_json(generated_text)

        response_data = json.loads(generated_text)
-        
        title = response_data["title"]
        text = response_data["content"]
-
        vector = embeddings.embed_query(generated_text)
        
        if not is_similar_data(title, text, link, vector, threshold=0.98):