diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index 2598ed5..b39ce5c 100644 Binary files a/pyth/__pycache__/scrapingsingle.cpython-310.pyc and b/pyth/__pycache__/scrapingsingle.cpython-310.pyc differ diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc index 9cb85af..e806a8a 100644 Binary files a/pyth/__pycache__/vectData.cpython-310.pyc and b/pyth/__pycache__/vectData.cpython-310.pyc differ diff --git a/pyth/articles.py b/pyth/articles.py index 56d5c5a..b5ae49f 100644 --- a/pyth/articles.py +++ b/pyth/articles.py @@ -7,6 +7,7 @@ from langchain.embeddings import OpenAIEmbeddings from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens import json +from json_repair import repair_json load_dotenv() @@ -16,7 +17,6 @@ embeddings = OpenAIEmbeddings() print(f"Checking for similar!") - def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95): try: titles, links, embeddings = get_titles_links_embeddings() @@ -142,7 +142,6 @@ def processing_similar(): else: user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." if l == 4: - print("4") a_one = articles[0][0] a_two = articles[1][0] a_three = articles[2][0] @@ -220,6 +219,8 @@ def processing_similar(): ]) generated_text = completion.choices[0].message.content + generated_text = repair_json(generated_text) + response_data = json.loads(generated_text) title = a_one text = response_data["content"] diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index e939adb..672ba87 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -8,7 +8,7 @@ from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing ) import json from dotenv import load_dotenv import tiktoken - +from json_repair import repair_json load_dotenv() cleansing() @@ -21,50 +21,50 @@ embeddings = OpenAIEmbeddings() dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info'] headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'} - - def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int: encoding = tiktoken.encoding_for_model(model) return len(encoding.encode(string)) def slice_text_at_2k_tokens(text): encoding_name = "gpt-3.5-turbo" - max_tokens = 2000 - + max_tokens = 1950 encoding = tiktoken.encoding_for_model(encoding_name) tokens = encoding.encode(text) - if len(tokens) <= max_tokens: return [text] - sliced_tokens = tokens[:max_tokens] sliced_text = encoding.decode(sliced_tokens) - return sliced_text +def slice_title_if_needed(text): + encoding_name = "gpt-3.5-turbo" + max_tokens = 100 + encoding = tiktoken.encoding_for_model(encoding_name) + tokens = encoding.encode(text) + if len(tokens) <= max_tokens: + return [text] + sliced_tokens = tokens[:max_tokens] + sliced_text = encoding.decode(sliced_tokens) + return sliced_text def replace_with_spaces(text): allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 " cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text) return cleaned_text - def fix_links(links_set): modified_links = set() - for link in links_set: if "www" in link: modified_link = link.replace("www.", "") modified_links.add(modified_link) else: modified_links.add(link) - return modified_links total_links = set() collected_news = set() - def get_article_links(url, already_checked): response = requests.get(url,headers) if response.status_code == 200: @@ -81,25 +81,22 @@ def get_article_links(url, already_checked): already_checked.add(link_value) return link_store - already_checked = set() for dlink in dlinks: temp_links = get_article_links(dlink, already_checked) if temp_links: total_links.update(temp_links) - final_links = {item for item in total_links if item} db_links = set(get_all_links()) new_links = final_links - db_links final_links = new_links final_links = set(final_links) - final_links = fix_links(final_links) if __name__ == '__main__': - + for link in final_links: response = requests.get(link,headers) soup = BeautifulSoup(response.text, 'html.parser') @@ -117,24 +114,26 @@ if __name__ == '__main__': text_text = slice_text_at_2k_tokens(text_text) text_text = replace_with_spaces(str(text_text)) - + + ttk = num_tokens_from_string(text_text) + + if ttk > 1900: + title_text = slice_title_if_needed(title_text) try: completion = client.chat.completions.create( model="gpt-3.5-turbo", messages=[ {"role": "system", "content": "Data analytic, Journalist and News reporter"}, - {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."} + {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."} ] ) generated_text = completion.choices[0].message.content - generated_text = generated_text + generated_text = repair_json(generated_text) response_data = json.loads(generated_text) - title = response_data["title"] text = response_data["content"] - vector = embeddings.embed_query(generated_text) if not is_similar_data(title, text, link, vector, threshold=0.98):