Adding VDB
This commit is contained in:
87
pyth/scrapingsingle.py
Normal file
87
pyth/scrapingsingle.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from openai import OpenAI
|
||||||
|
import os
|
||||||
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
|
from langchain.vectorstores.pgvector import PGVector
|
||||||
|
from vectData import insert_data ,is_similar_data
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
# The OpenAI SDK and the LangChain embeddings wrapper both read the API key
# from the OPENAI_API_KEY environment variable. The key must never be written
# into source control.
# NOTE(review): a key was previously hard-coded at this spot; it has been
# exposed in the repository history and should be revoked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this script."
    )

client = OpenAI()
embeddings = OpenAIEmbeddings()

# Front pages that are crawled for article links.
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']

# Browser-like User-Agent used for the outgoing HTTP requests.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

# Every article URL discovered across all front pages.
total_links = set()
# Not referenced anywhere in the visible part of this script.
collected_news = set()
|
||||||
|
|
||||||
|
def get_article_links(url, already_checked, timeout=10):
    """Collect not-yet-seen links found inside ``<article>`` tags of a page.

    Args:
        url: Address of the page to download; also the base used to resolve
            relative ``href`` values.
        already_checked: Set of absolute URLs seen so far. It is updated in
            place with every link returned by this call.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of absolute URLs that were not in ``already_checked``. The list
        is empty when the request fails or the status code is not 200.
    """
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of requests.get() is ``params``, which would put the
        # User-Agent into the query string instead of sending it as a header.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for link in article.find_all('a', href=True):
            link_value = urljoin(url, link['href'])
            if link_value not in already_checked:
                link_store.append(link_value)
                already_checked.add(link_value)
    return link_store
|
||||||
|
|
||||||
|
# URLs already returned by get_article_links, shared across all front pages
# so the same article is not queued twice.
already_checked = set()

for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if temp_links:
        total_links.update(temp_links)

# Drop empty entries before fetching.
final_links = {item for item in total_links if item}

for link in final_links:
    try:
        # ``headers`` has to be a keyword argument; passed positionally it is
        # interpreted as query-string ``params``.
        response = requests.get(link, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # One unreachable article must not abort the whole run.
        print(f"Error fetching {link}: {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    titles = soup.find_all(['h2', 'h1', 'h3'])
    title_text = ' '.join(title.get_text(strip=True) for title in titles)

    texts = soup.find_all(['p'])
    text_text = ' '.join(text.get_text(strip=True) for text in texts)

    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
            ]
        )
        generated_text = completion.choices[0].message.content

        # The model is asked for a JSON object; anything else raises here and
        # is reported by the handler below.
        response_data = json.loads(generated_text)

        title = response_data["title"]
        text = response_data["content"]

        print("*********************************")
        print(f"Title: {title}")
        print("---------------------------------")
        print(f"Content : {text}")
        print("*********************************")

        # The embedding is computed over the raw model output (title and
        # content together).
        vector = embeddings.embed_query(generated_text)

        if not is_similar_data(title, text, link, vector, threshold=0.9):
            insert_data(title, text, link, vector)

    except Exception as e:
        # Top-level boundary of the per-article work: report and move on.
        print(f"Error in completion: {e}")
        continue
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
import requests
|
|
||||||
from openai import OpenAI
|
|
||||||
import os
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
# The OpenAI SDK reads the API key from the OPENAI_API_KEY environment
# variable; it must be exported before running and never committed to source.
# NOTE(review): a key was previously hard-coded at this spot and should be
# revoked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this script."
    )

client = OpenAI()

urls = ['https://klix.ba/', 'https://srpskainfo.com/', 'https://bljesak.info/']

for url in urls:
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    tags = soup.find_all(['h2', 'p'])

    # Concatenate the text of every heading and paragraph, with no separator.
    prompt_text = ''.join(tag.get_text(strip=True) for tag in tags)

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Data analytic, Journalist and News reporter"},
            {"role": "user", "content": f"Extract for me evry title and full content for evry title from {prompt_text},without shortening,remove all thing that are not connected to news, make it clear for reading"}
        ]
    )
    generated_text = completion.choices[0].message.content
    print(f"Text for {url}: \n {generated_text}\n")
|
|
||||||
115
pyth/vectData.py
Normal file
115
pyth/vectData.py
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
import psycopg2
|
||||||
|
from psycopg2 import sql
|
||||||
|
from pgvector.psycopg2 import register_vector
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import os  # imported here because only this settings block needs it

# PostgreSQL connection settings shared by every function in this module.
host = 'localhost'
port = '5432'
user = 'postgres'
# The password is taken from the environment instead of being stored in the
# source. When PGPASSWORD is unset this is None.
# NOTE(review): the password that was previously hard-coded here has been
# exposed in the repository history and should be changed.
password = os.environ.get('PGPASSWORD')
dbname = 'vector_svw'
|
||||||
|
|
||||||
|
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a float.

    Args:
        v1: Sequence or array of numbers.
        v2: Sequence or array of numbers with the same length as ``v1``.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms, or ``0.0`` when either vector has zero norm (dividing
        by a zero norm would otherwise produce NaN).

    Raises:
        ValueError: If the flattened vectors differ in length.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
|
||||||
|
|
||||||
|
def is_similar_data(title, text, link, embedding, threshold=0.9):
    """Check whether a stored row has an embedding close to ``embedding``.

    Every row of ``vectorsvevijesti`` is loaded and compared with
    ``calculate_cosine_similarity``.

    Args:
        title: Title of the candidate item; used only in the printed messages.
        text: Not used by this function.
        link: Not used by this function.
        embedding: Embedding vector of the candidate item.
        threshold: A stored row counts as similar when its cosine similarity
            with ``embedding`` is strictly greater than this value.

    Returns:
        True as soon as one stored embedding exceeds the threshold, otherwise
        False.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        # Without registering the pgvector type on this connection, psycopg2
        # hands back ``vector`` columns as text, which cannot be compared
        # numerically.
        register_vector(conn)
        with conn.cursor() as cursor:
            cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
            existing_embeddings = cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    for existing_title, stored_embedding in existing_embeddings:
        if stored_embedding is None:
            # Rows without an embedding cannot be compared.
            continue
        existing_embedding = np.array(stored_embedding).flatten()
        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if similarity > threshold:
            print(f"Similar data found: \n #{title} \n #{existing_title}")
            return True

    print(f"Inserting: #{title}")
    return False
|
||||||
|
|
||||||
|
def insert_data(title, text, link, embedding):
    """Insert one row into the ``vectorsvevijesti`` table and commit it.

    Args:
        title: Value for the ``title`` column.
        text: Value for the ``text`` column.
        link: Value for the ``link`` column.
        embedding: Value for the ``embedding`` column.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''
                INSERT INTO vectorsvevijesti (title, text, link, embedding)
                VALUES (%s, %s, %s, %s);
            ''', (title, text, link, embedding))
        conn.commit()
    finally:
        # Release the connection even when the insert or commit raises.
        conn.close()
|
||||||
|
|
||||||
|
def get_data():
    """Return every stored row as a list of ``(title, text, link)`` tuples."""
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
            data = cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    return data
|
||||||
|
|
||||||
|
def create_db(reset=True):
    """Create the ``vector`` extension and the ``vectorsvevijesti`` table.

    Args:
        reset: When true (the default), an existing ``vectorsvevijesti``
            table is dropped first, so all stored rows are lost. When false,
            the table is only created if it does not exist yet.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")

            # Registered after the extension is created, as in the original
            # statement order.
            register_vector(conn)

            if reset:
                cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS vectorsvevijesti (
                    id bigserial PRIMARY KEY,
                    title VARCHAR,
                    text VARCHAR,
                    link VARCHAR,
                    embedding vector(1536)
                );
            ''')
        conn.commit()
    finally:
        # Release the connection even when a statement raises.
        conn.close()


# NOTE(review): this call runs every time the module is imported and, with
# the default reset=True, drops the table and all stored rows. Confirm that
# wiping the data on each import is intended; otherwise call
# create_db(reset=False) or move the call behind an explicit entry point.
create_db()
|
||||||
Reference in New Issue
Block a user