"""Collect article links from news front pages and clean each article with an LLM.

Pipeline, executed at module top level:

1. Fetch every front page in ``dlinks`` and gather the links found inside
   ``<article>`` elements.
2. Fetch every gathered link, join its headings and paragraphs into plain
   text, and ask the chat model to return a JSON object with ``title`` and
   ``content`` fields.
3. Embed the model's answer and hand it to ``insert_data`` unless
   ``is_similar_data`` returns a truthy value for it.

``OPENAI_API_KEY`` must be set in the environment before running.
"""
import json
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
from openai import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

from vectData import insert_data, is_similar_data

# SECURITY: the API key used to be hard-coded here. A secret committed to
# source control must be considered leaked (revoke it); the key is now taken
# from the environment only.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")

client = OpenAI()
embeddings = OpenAIEmbeddings()

dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

# Seconds to wait for a server before giving up on a single request, so one
# stalled host cannot hang the whole run.
REQUEST_TIMEOUT = 30

total_links = set()
collected_news = set()


def _fetch(url):
    """Return the response for *url*, or None on a network error or non-200 status."""
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    if response.status_code != 200:
        print(f"Skipping {url}: HTTP {response.status_code}")
        return None
    return response


def get_article_links(url, already_checked):
    """Return the not-yet-seen links found inside ``<article>`` elements of *url*.

    Args:
        url: Page to fetch. Relative ``href`` values are resolved against it.
        already_checked: Set of absolute URLs seen so far. It is mutated in
            place: every link returned by this call is added to it.

    Returns:
        A list of absolute URLs in document order. The list is empty when
        the page cannot be fetched or does not answer with HTTP 200.
    """
    response = _fetch(url)
    if response is None:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for link in article.find_all('a', href=True):
            link_value = urljoin(url, link['href'])
            if link_value not in already_checked:
                link_store.append(link_value)
                already_checked.add(link_value)
    return link_store


# Step 1: gather links from every front page; ``already_checked`` is shared
# across pages so a link is only collected once.
already_checked = set()
for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if temp_links:
        total_links.update(temp_links)

# Drop empty values before visiting the links.
final_links = {item for item in total_links if item}

# Steps 2 and 3: process each article independently so that one failure does
# not stop the remaining articles.
for link in final_links:
    response = _fetch(link)
    if response is None:
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    titles = soup.find_all(['h2', 'h1', 'h3'])
    title_text = ' '.join([title.get_text(strip=True) for title in titles])

    texts = soup.find_all(['p'])
    text_text = ' '.join([text.get_text(strip=True) for text in texts])

    # The two sentences of the prompt are separated by a line break.
    prompt = (
        f"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. \n"
        "Remove any non-news element related to the current text and title, "
        "and provide the cleaned data as a JSON object with 'title' and 'content' fields."
    )

    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                {"role": "user", "content": prompt},
            ],
        )

        generated_text = completion.choices[0].message.content
        # Raises if the model did not answer with a JSON object containing
        # both keys; handled by the except clause below.
        response_data = json.loads(generated_text)
        title = response_data["title"]
        text = response_data["content"]

        print("*********************************")
        print(f"Title: {title}")
        print("---------------------------------")
        print(f"Content : {text}")
        print("*********************************")

        vector = embeddings.embed_query(generated_text)

        # NOTE(review): is_similar_data / insert_data come from vectData;
        # presumably a near-duplicate check and a store insert — confirm there.
        if not is_similar_data(title, text, link, vector, threshold=0.9):
            insert_data(title, text, link, vector)
    except Exception as e:
        # Per-article boundary: API, JSON and storage errors are reported and
        # the loop moves on to the next link.
        print(f"Error in completion: {e}")