# Files
# old-svevijesti/pyth/scrapingsingle.py
# 2023-12-25 12:31:55 +01:00

# 88 lines
# 3.0 KiB
# Python

from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from openai import OpenAI
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from vectData import insert_data ,is_similar_data
import json
# SECURITY: the OpenAI API key must be supplied through the environment and
# never committed to source control. A key that was previously hard-coded at
# this spot has to be treated as leaked and revoked in the OpenAI dashboard.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before "
        "running this script."
    )

# Both objects are constructed without arguments, so they rely on the
# OPENAI_API_KEY environment variable checked above.
client = OpenAI()
embeddings = OpenAIEmbeddings()
# Front pages that are scanned for article links.
dlinks = [
    'https://klix.ba',
    'https://srpskainfo.com',
    'https://bljesak.info',
]

# Request headers used for every page fetch (mobile Chrome User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/47.0.2526.83 Mobile Safari/537.36'
    ),
}

# Accumulators filled while crawling.
total_links = set()
collected_news = set()
def get_article_links(url, already_checked, timeout=30):
    """Collect not-yet-seen article links from the page at *url*.

    The page is fetched, every ``<a href=...>`` nested inside an
    ``<article>`` element is resolved against *url*, and links that are not
    yet in *already_checked* are gathered.

    Args:
        url: Address of the page to scan.
        already_checked: Set of absolute links seen so far. It is mutated in
            place: every link that is returned is also added to it.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of newly discovered absolute links in document order, or
        ``None`` when the server does not answer with HTTP 200.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # ``headers`` has to be passed by keyword: the second positional
    # parameter of ``requests.get`` is ``params``, which would append the
    # User-Agent to the query string instead of sending it as a header.
    # The timeout keeps a stalled server from blocking the crawl forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for link in article.find_all('a', href=True):
            # Resolve relative hrefs so duplicates compare equal.
            link_value = urljoin(url, link['href'])
            if link_value not in already_checked:
                link_store.append(link_value)
                already_checked.add(link_value)
    return link_store
# Visit every seed page once; the shared ``already_checked`` set keeps a link
# that appears on several pages from being collected twice.
already_checked = set()
for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if not temp_links:
        # Nothing usable came back for this seed page.
        continue
    total_links.update(temp_links)

# Drop falsy entries (e.g. empty strings) before processing.
final_links = set(filter(None, total_links))
# For every collected link: download the page, let the chat model reduce it
# to a clean title/content pair, embed the result, and store it unless a
# similar item is already present.
for link in final_links:
    try:
        # ``headers`` must be passed by keyword; positionally it would be
        # taken as ``params`` and end up in the query string. The timeout
        # keeps one stalled server from blocking the whole run.
        response = requests.get(link, headers=headers, timeout=30)
    except requests.RequestException as e:
        # One unreachable article must not abort the remaining ones.
        print(f"Error fetching {link}: {e}")
        continue
    if response.status_code != 200:
        # Same rule get_article_links applies: only successful responses are
        # parsed, so error pages are never sent to the model or stored.
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    titles = soup.find_all(['h2', 'h1', 'h3'])
    title_text = ' '.join(title.get_text(strip=True) for title in titles)
    texts = soup.find_all(['p'])
    text_text = ' '.join(text.get_text(strip=True) for text in texts)

    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
            ]
        )
        generated_text = completion.choices[0].message.content
        # The reply is expected to be a JSON object; anything else raises
        # here and is reported by the handler below.
        response_data = json.loads(generated_text)
        title = response_data["title"]
        text = response_data["content"]
        print("*********************************")
        print(f"Title: {title}")
        print("---------------------------------")
        print(f"Content : {text}")
        print("*********************************")
        # The embedding is computed from the raw model reply (title and
        # content together), not from the content field alone.
        vector = embeddings.embed_query(generated_text)
        if not is_similar_data(title, text, link, vector, threshold=0.9):
            insert_data(title, text, link, vector)
    except Exception as e:
        # Best-effort boundary: API failures, malformed JSON, missing keys
        # and storage errors are reported and the loop moves on.
        print(f"Error in completion: {e}")