Adding VDB
This commit is contained in:
87
pyth/scrapingsingle.py
Normal file
87
pyth/scrapingsingle.py
Normal file
@@ -0,0 +1,87 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from urllib.parse import urljoin
|
||||
from openai import OpenAI
|
||||
import os
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from langchain.vectorstores.pgvector import PGVector
|
||||
from vectData import insert_data ,is_similar_data
|
||||
import json
|
||||
|
||||
|
||||
# SECURITY: the OpenAI credential must be supplied through the environment
# (e.g. `export OPENAI_API_KEY=...`) and never committed to source control.
# The key that used to be hard-coded on this line has been published and
# should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY environment variable is not set; "
        "export it before running this script."
    )

# Both clients are constructed without arguments, so they rely on the
# OPENAI_API_KEY environment variable checked above.
client = OpenAI()
embeddings = OpenAIEmbeddings()

# Front pages of the news portals that are scanned for article links.
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']

# HTTP headers intended to accompany every outgoing request.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

# Article URLs gathered from all front pages.
total_links = set()
# Not referenced anywhere else in this file; kept so the module's globals
# stay unchanged.
collected_news = set()
|
||||
|
||||
def get_article_links(url, already_checked, timeout=30):
    """Collect not-yet-seen links found inside ``<article>`` elements of a page.

    Every ``<a href=...>`` inside an ``<article>`` tag is resolved against
    *url* with ``urljoin``. Links that are not yet in *already_checked* are
    returned and also added to that set.

    Args:
        url: Address of the page to scan.
        already_checked: Set of absolute URLs seen so far. Mutated in place.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        List of newly discovered absolute URLs in document order. The list is
        empty when the request fails or the server does not answer with
        HTTP 200, so callers can keep testing the result for truthiness.
    """
    try:
        # `headers` must be passed by keyword: the second positional argument
        # of requests.get is `params`, which would put the User-Agent into the
        # query string instead of sending it as an HTTP header.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # One unreachable site must not abort the whole crawl.
        print(f"Error fetching {url}: {e}")
        return []

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []

    for article in soup.find_all('article'):
        for link in article.find_all('a', href=True):
            link_value = urljoin(url, link['href'])
            if link_value not in already_checked:
                link_store.append(link_value)
                already_checked.add(link_value)

    return link_store
|
||||
|
||||
# URLs seen so far; shared across all front pages so that an article linked
# from several places is queued only once.
already_checked = set()

# Scan each configured front page and pool whatever new links it yields.
for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if not temp_links:
        # Nothing new (or nothing returned) for this site.
        continue
    total_links.update(temp_links)

# Keep only the truthy entries (drops empty strings, for example).
final_links = set(filter(None, total_links))
|
||||
|
||||
# Fetch every collected article, ask the chat model to reduce it to a clean
# title/content pair, embed the result and store it unless a similar entry
# is already present.
for link in final_links:
    try:
        # `headers` must be passed by keyword: the second positional argument
        # of requests.get is `params`, which would put the User-Agent into the
        # query string instead of sending it as an HTTP header.
        response = requests.get(link, headers=headers, timeout=30)
        # Skip error pages instead of sending their markup to the model.
        response.raise_for_status()
    except requests.RequestException as e:
        # A single unreachable article must not abort the whole run.
        print(f"Error fetching {link}: {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # All headings on the page, joined into one candidate title string.
    titles = soup.find_all(['h2', 'h1', 'h3'])
    title_text = ' '.join([title.get_text(strip=True) for title in titles])

    # All paragraphs on the page, joined into one candidate body string.
    texts = soup.find_all(['p'])
    text_text = ' '.join([text.get_text(strip=True) for text in texts])

    # Best-effort per article: any failure in the model call, the JSON
    # parsing, the embedding or the storage step is reported and the loop
    # moves on to the next link.
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
            ]
        )
        generated_text = completion.choices[0].message.content

        # The model is asked for a JSON object; a malformed reply raises here
        # and is handled by the except clause below.
        response_data = json.loads(generated_text)

        title = response_data["title"]
        text = response_data["content"]

        print("*********************************")
        print(f"Title: {title}")
        print("---------------------------------")
        print(f"Content : {text}")
        print("*********************************")

        # The embedding is computed over the model's raw reply.
        vector = embeddings.embed_query(generated_text)

        if not is_similar_data(title, text, link, vector, threshold=0.9):
            insert_data(title, text, link, vector)

    except Exception as e:
        print(f"Error in completion: {e}")
|
||||
Reference in New Issue
Block a user