From 96a2d888953319b0b2593dd504ad461f6073c4b5 Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Sat, 6 Jan 2024 08:26:31 +0100 Subject: [PATCH] Removing previous f. --- pyth/scrapingsingle.py | 112 ----------------------------------------- 1 file changed, 112 deletions(-) diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index 44ff2eb..ac86b52 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -155,115 +155,3 @@ if __name__ == '__main__': print(f"Error in completion: {e}") continue - -def comb_similar(): - - print("Checking similar") - similar_article = get_similar() - - grouped_data = {} - - - for sa in similar_article: - if similar_article: - first_t = get_specific_data(sa[0]) - second_t = get_specific_data(sa[1]) - link_f = first_t[0][2] - link_s = second_t[0][2] - f_text = first_t[0][1] - s_text = second_t[0][1] - f_title = first_t[0][0] - s_title = second_t[0][0] - - if f_title in grouped_data: - grouped_data[f_title].append((f_text, link_f)) - else: - grouped_data[f_title] = [(f_text, link_f)] - - if s_title in grouped_data: - grouped_data[s_title].append((s_text, link_s)) - else: - grouped_data[s_title] = [(s_text, link_s)] - - for title, tuples in grouped_data.items(): - if len(tuples) == 3: - text1, link1 = tuples[0] - text2, link2 = tuples[1] - text3, link3 = tuples[2] - - t1check = num_tokens_from_string(text1) - t2check = num_tokens_from_string(text2) - t3check = num_tokens_from_string(text3) - slice_if_more = t1check,t2check,t3check - if slice_if_more < 2000: - combined_text = f"{text1}{text2}{text3}" - combined_text = slice_text_at_2k_tokens(combined_text) - user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field" - if link1 != link2 and link1 != link3 and link2 != link3: - link = f"{link1} {link2} {link3}" - else: - link = link1 - - else: - user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - if link1 != link2 and link1 != link3 and link2 != link3: - link = f"{link1} {link2} {link3}" - else: - link = link1 - else: - ftcheck = num_tokens_from_string(f_text) - stcheck = num_tokens_from_string(s_text) - fscomb = ftcheck + stcheck - if fscomb <2000: - combined_text = f"{f_text}{s_text}" - user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field" - if link_f != link_s: - link = f"{link_f} {link_s}" - else: - link = link_f - - else: - user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." - if link_f != link_s: - link = f"{link_f} {link_s}" - else: - link = link_f - try: - completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "system", "content": "Data analytic, Journalist and News reporter"}, - {"role": "user", "content": user_message} - ] - ) - generated_text = completion.choices[0].message.content - - if similar_article: - if f_title == s_title: - print(f_title) - modify_similar_data(first_t,"SOURCE") - similar_article.remove(sa) - print("Modified") - else: - print(f"First: {f_title}") - print(f"Second: {s_title}") - modify_similar_data(first_t,"SOURCE") - modify_similar_data(second_t,"SOURCE") - similar_article.remove(sa) - print("Modified") - else: - print("Similar list is empty") - - response_data = json.loads(generated_text) - title = f_title - text = response_data["content"] - - vector = embeddings.embed_query(generated_text) - - if not is_similar_data(title, text, link, vector, threshold=0.98): - similar_d = "NO" - insert_data(title, text, link, vector, similar_d) - - except Exception as e: - print(f"Error in completion: {e}") - continue