"""Scrape news portals, clean each article with an LLM, and merge similar ones.

Running the module (or importing it) collects article links from the sites in
``dlinks`` and drops the ones already stored.  When executed as a script, each
new article is fetched, cleaned through the chat model, embedded and stored.
Finally ``comb_similar()`` merges stored articles that were flagged as similar.
"""
import json
import os
from urllib.parse import urljoin

import requests
import tiktoken
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from openai import OpenAI, APIError

from vectData import (insert_data, is_similar_data, get_similar,
                      get_specific_data, delete_specific, get_all_links,
                      cleansing, modify_similar_data)

load_dotenv()
cleansing()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()

dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

MODEL_NAME = "gpt-3.5-turbo"
SYSTEM_PROMPT = "Data analytic, Journalist and News reporter"
MAX_TOKENS = 2000
SIMILARITY_THRESHOLD = 0.98
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled site hangs the run

# Characters kept by replace_with_spaces(): ASCII letters, the extra letters of
# the Bosnian/Croatian/Serbian Latin alphabet, digits and the space.
# The original literal had "đŠšŽ" collapsed into one stray CJK character
# (apparently an encoding mix-up), so those letters were being blanked out.
_ALLOWED_CHARS = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "ČčĆćDždžĐđŠšŽž0123456789 "
)


def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Return how many tokens *string* occupies for *model*'s tokenizer."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(string))


def slice_text_at_2k_tokens(text, max_tokens=MAX_TOKENS, model=MODEL_NAME):
    """Return *text* truncated to at most *max_tokens* tokens.

    Always returns a ``str``.  The previous version returned ``[text]`` (a
    list) for short input, so callers that formatted the result ended up
    sending a list ``repr`` to the model.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])


def replace_with_spaces(text):
    """Replace every character outside the allowed alphabet with a space."""
    return ''.join(char if char in _ALLOWED_CHARS else ' ' for char in text)


def get_article_links(url, already_checked):
    """Collect unseen links found inside ``<article>`` elements of *url*.

    Args:
        url: Page to scan; relative hrefs are resolved against it.
        already_checked: Set of links seen so far.  It is updated in place so
            the same link is not returned twice across calls.

    Returns:
        List of absolute links not previously in *already_checked*; empty when
        the request fails or does not answer with HTTP 200.
    """
    try:
        # `headers` must be passed by keyword: the second positional
        # parameter of requests.get() is `params` (the query string).
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Error fetching {url}: {exc}")
        return []
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for anchor in article.find_all('a', href=True):
            link_value = urljoin(url, anchor['href'])
            if link_value not in already_checked:
                already_checked.add(link_value)
                link_store.append(link_value)
    return link_store


def _chat_completion(user_message):
    """Send *user_message* to the chat model and return the reply text."""
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ],
    )
    return completion.choices[0].message.content


def _store_if_new(title, text, link, vector):
    """Insert the article unless the store reports it as similar to existing data."""
    if not is_similar_data(title, text, link, vector,
                           threshold=SIMILARITY_THRESHOLD):
        insert_data(title, text, link, vector, "NO")


def _process_article(link):
    """Fetch one article page, clean it through the model and store it."""
    try:
        response = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable article must not abort the whole run.
        print(f"Error fetching {link}: {exc}")
        return
    if response.status_code != 200:
        # Same rule as get_article_links(): do not feed error pages to the model.
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    title_text = ' '.join(
        heading.get_text(strip=True)
        for heading in soup.find_all(['h2', 'h1', 'h3'])
    )
    text_text = ' '.join(
        paragraph.get_text(strip=True) for paragraph in soup.find_all(['p'])
    )

    title_text = replace_with_spaces(title_text)
    print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")
    text_text = replace_with_spaces(slice_text_at_2k_tokens(text_text))

    user_message = (
        f"Extract relevant information from the following input: "
        f"Title: {title_text}, Text: {text_text}. "
        "Remove any non-news element related to the current text and title, "
        "and provide the cleaned data as a JSON object with 'title' and "
        "'content' fields."
    )
    try:
        generated_text = _chat_completion(user_message)
        response_data = json.loads(generated_text)
        title = response_data["title"]
        text = response_data["content"]
        vector = embeddings.embed_query(generated_text)
        _store_if_new(title, text, link, vector)
    except Exception as e:
        # Best effort per article: report the failure and move on.
        print(f"Error in completion: {e}")


def _build_merge_prompt(texts, links):
    """Return ``(user_message, link)`` for merging two or three source texts."""
    count = len(texts)
    link = " ".join(links)
    # The original three-source branch compared a *tuple* of token counts with
    # 2000, which raises TypeError; the budget applies to the sum.
    total_tokens = sum(num_tokens_from_string(text) for text in texts)
    if total_tokens < MAX_TOKENS:
        combined_text = slice_text_at_2k_tokens("".join(texts))
        user_message = (
            f"Here is text {combined_text}, combined from {count} sources, "
            "filter text, and make news content, return as JSON only with "
            "'content' field"
        )
    else:
        # NOTE(review): as in the original, over-budget texts are sent
        # untruncated - confirm this is intended.
        listed = f"{' '.join(texts[:-1])} and {texts[-1]}"
        user_message = (
            f"Here are {count} texts {listed}, combine the following texts "
            "into a cohesive news remove any non-news related to both texts "
            "and provide the cleaned data as a JSON only with 'content' field."
        )
    return user_message, link


def comb_similar():
    """Merge each pair of similar stored articles into one combined article.

    For every pair returned by ``get_similar()`` the two records are loaded,
    a merge prompt is sent to the chat model, the source records are marked
    "SOURCE", and the merged text is stored under the first record's title
    unless the store reports it as similar to existing data.
    """
    print("Checking similar")
    similar_article = get_similar()
    if not similar_article:
        # This message used to sit in a branch inside the loop, where the
        # list can never be empty, so it was unreachable.
        print("Similar list is empty")
        return

    grouped_data = {}
    # Iterate over a snapshot: the loop body removes the processed pair from
    # `similar_article`, and mutating a list while iterating it skips items.
    for sa in list(similar_article):
        first_t = get_specific_data(sa[0])
        second_t = get_specific_data(sa[1])
        # Row layout as indexed by the original code: [title, text, link].
        f_title, f_text, link_f = first_t[0][0], first_t[0][1], first_t[0][2]
        s_title, s_text, link_s = second_t[0][0], second_t[0][1], second_t[0][2]

        grouped_data.setdefault(f_title, []).append((f_text, link_f))
        grouped_data.setdefault(s_title, []).append((s_text, link_s))

        # NOTE(review): the original scanned every title group and kept
        # whatever prompt the last one produced, so the prompt could belong
        # to an unrelated title.  The group of the title the result is stored
        # under is used instead - confirm this matches the intent.
        sources = grouped_data[f_title]
        if len(sources) == 3:
            texts = [source_text for source_text, _ in sources]
            links = [source_link for _, source_link in sources]
        else:
            texts = [f_text, s_text]
            links = [link_f, link_s]
        user_message, link = _build_merge_prompt(texts, links)

        try:
            generated_text = _chat_completion(user_message)

            if f_title == s_title:
                print(f_title)
                modify_similar_data(first_t, "SOURCE")
            else:
                print(f"Second: {s_title}")
                modify_similar_data(first_t, "SOURCE")
                modify_similar_data(second_t, "SOURCE")
            similar_article.remove(sa)
            print("Modified")

            response_data = json.loads(generated_text)
            title = f_title
            text = response_data["content"]
            vector = embeddings.embed_query(generated_text)
            _store_if_new(title, text, link, vector)
        except Exception as e:
            # Best effort per pair: report the failure and move on.
            print(f"Error in completion: {e}")
            continue


# --- Link collection (runs at import time, as in the original script) -------
total_links = set()
collected_news = set()
already_checked = set()
for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if temp_links:
        total_links.update(temp_links)

final_links = {item for item in total_links if item}
i = 0  # unused; kept so the module namespace is unchanged
db_links = set(get_all_links())
new_links = final_links - db_links
final_links = new_links

if __name__ == '__main__':
    for link in final_links:
        _process_article(link)

comb_similar()