From 954ae97a9683c07e3cd5533e29733e60d7e37742 Mon Sep 17 00:00:00 2001
From: Amir Sabani
Date: Mon, 25 Dec 2023 12:31:55 +0100
Subject: [PATCH] Adding VDB

---
 pyth/scrapingsingle.py |  97 +++++++++++++++++++++++++++++++++++
 pyth/singlearticle.py  |  32 ------------
 pyth/vectData.py       | 110 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 207 insertions(+), 32 deletions(-)
 create mode 100644 pyth/scrapingsingle.py
 delete mode 100644 pyth/singlearticle.py
 create mode 100644 pyth/vectData.py

diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py
new file mode 100644
index 0000000..e03be09
--- /dev/null
+++ b/pyth/scrapingsingle.py
@@ -0,0 +1,97 @@
+"""Scrape news portals, clean each article via OpenAI and store it with its embedding."""
+from bs4 import BeautifulSoup
+import requests
+from urllib.parse import urljoin
+from openai import OpenAI
+import os
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores.pgvector import PGVector
+from vectData import insert_data ,is_similar_data
+import json
+
+
+# The OpenAI key is taken from the environment so that it is never committed.
+if "OPENAI_API_KEY" not in os.environ:
+    raise SystemExit("OPENAI_API_KEY environment variable is not set")
+client = OpenAI()
+embeddings = OpenAIEmbeddings()
+
+dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
+headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
+
+
+total_links = set()
+collected_news = set()
+
+def get_article_links(url, already_checked):
+    """Return the unseen links found inside the ``article`` elements of *url*.
+
+    Each returned link is also added to *already_checked*. An empty list is
+    returned when the page does not answer with HTTP 200.
+    """
+    # Pass headers by keyword: the second positional parameter of
+    # requests.get is ``params``, so the User-Agent used to end up in the
+    # query string instead of being sent as a header.
+    response = requests.get(url, headers=headers)
+    if response.status_code != 200:
+        return []
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+    link_store = []
+    for article in soup.find_all('article'):
+        for link in article.find_all('a', href=True):
+            link_value = urljoin(url, link['href'])
+            if link_value not in already_checked:
+                link_store.append(link_value)
+                already_checked.add(link_value)
+    return link_store
+
+already_checked = set()
+
+for dlink in dlinks:
+    temp_links = get_article_links(dlink, already_checked)
+    if temp_links:
+        total_links.update(temp_links)
+
+final_links = {item for item in total_links if item}
+
+for link in final_links:
+    response = requests.get(link, headers=headers)
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    titles = soup.find_all(['h2', 'h1','h3'])
+    title_text = ' '.join([title.get_text(strip=True) for title in titles])
+
+    texts = soup.find_all(['p'])
+    text_text = ' '.join([text.get_text(strip=True) for text in texts])
+
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
+                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
+            ]
+        )
+        generated_text = completion.choices[0].message.content
+
+        response_data = json.loads(generated_text)
+
+        title = response_data["title"]
+        text = response_data["content"]
+
+        print("*********************************")
+        print(f"Title: {title}")
+        print("---------------------------------")
+        print(f"Content : {text}")
+        print("*********************************")
+
+
+        vector = embeddings.embed_query(generated_text)
+
+        if not is_similar_data(title, text, link, vector, threshold=0.9):
+            insert_data(title, text, link, vector)
+
+    except Exception as e:
+        print(f"Error in completion: {e}")
+        continue
diff --git a/pyth/singlearticle.py b/pyth/singlearticle.py
deleted file mode 100644
index 781e26c..0000000
--- a/pyth/singlearticle.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import requests
-from openai import OpenAI
-import os
-from bs4 import BeautifulSoup
-
-os.environ["OPENAI_API_KEY"] = "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"
-
-client = OpenAI()
-
-urls = ['https://klix.ba/', 'https://srpskainfo.com/', 'https://bljesak.info/']
-
-
-for url in urls:
-    response = requests.get(url)
-    html = response.text
-    soup = BeautifulSoup(html, 'html.parser')
-    tags = soup.find_all(['h2', 'p'])
-
-    prompt_text = ''
-    for tag in tags:
-        text = tag.get_text(strip=True)
-        prompt_text = prompt_text + text
-
-    completion = client.chat.completions.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "Data analytic, Journalist and News reporter"},
-            {"role": "user", "content": f"Extract for me evry title and full content for evry title from {prompt_text},without shortening,remove all thing that are not connected to news, make it clear for reading"}
-        ]
-    )
-    generated_text = completion.choices[0].message.content
-    print(f"Text for {url}: \n {generated_text}\n")
diff --git a/pyth/vectData.py b/pyth/vectData.py
new file mode 100644
index 0000000..dd1e2d7
--- /dev/null
+++ b/pyth/vectData.py
@@ -0,0 +1,110 @@
+"""PostgreSQL/pgvector storage helpers for the scraped news articles."""
+import os
+from contextlib import closing
+
+import psycopg2
+from psycopg2 import sql
+from pgvector.psycopg2 import register_vector
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+host = 'localhost'
+port = '5432'
+user = 'postgres'
+# Credentials must not be committed; the password is read from the
+# PGPASSWORD environment variable (None when it is not set).
+password = os.environ.get('PGPASSWORD')
+dbname = 'vector_svw'
+
+
+def _connect():
+    """Open a new database connection from the module-level settings."""
+    return psycopg2.connect(
+        host=host,
+        port=port,
+        user=user,
+        password=password,
+        dbname=dbname
+    )
+
+
+def calculate_cosine_similarity(v1, v2):
+    """Return the cosine similarity between the vectors *v1* and *v2*."""
+    v1 = np.asarray(v1, dtype=float)
+    v2 = np.asarray(v2, dtype=float)
+    v1_normalized = v1 / np.linalg.norm(v1)
+    v2_normalized = v2 / np.linalg.norm(v2)
+
+    similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
+    return similarity
+
+
+def is_similar_data(title, text, link, embedding, threshold=0.9):
+    """Return True when a stored embedding is more similar than *threshold*.
+
+    *embedding* is compared with every row of ``vectorsvevijesti``; *text*
+    and *link* are accepted for call compatibility but are not used.
+    """
+    with closing(_connect()) as conn:
+        # Register the pgvector type so the embedding column is returned as
+        # numbers rather than as its text representation.
+        register_vector(conn)
+        with closing(conn.cursor()) as cursor:
+            cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
+            existing_embeddings = cursor.fetchall()
+
+    for existing_title, stored_embedding in existing_embeddings:
+        existing_embedding = np.array(stored_embedding).flatten()
+        similarity = calculate_cosine_similarity(existing_embedding, embedding)
+        if similarity > threshold:
+            print(f"Similar data found: \n #{title} \n #{existing_title}")
+            return True
+
+    print(f"Inserting: #{title}")
+    return False
+
+
+def insert_data(title, text, link, embedding):
+    """Insert one article row into ``vectorsvevijesti`` and commit it."""
+    with closing(_connect()) as conn:
+        with closing(conn.cursor()) as cursor:
+            cursor.execute('''
+                INSERT INTO vectorsvevijesti (title, text, link, embedding)
+                VALUES (%s, %s, %s, %s);
+            ''', (title, text, link, embedding))
+        conn.commit()
+
+
+def get_data():
+    """Return every stored article as ``(title, text, link)`` tuples."""
+    with closing(_connect()) as conn:
+        with closing(conn.cursor()) as cursor:
+            cursor.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
+            return cursor.fetchall()
+
+
+def create_db():
+    """Enable the vector extension and recreate ``vectorsvevijesti`` empty."""
+    with closing(_connect()) as conn:
+        with closing(conn.cursor()) as cursor:
+            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
+
+            register_vector(conn)
+
+            cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
+
+            cursor.execute('''
+                CREATE TABLE vectorsvevijesti (
+                    id bigserial PRIMARY KEY,
+                    title VARCHAR,
+                    text VARCHAR,
+                    link VARCHAR,
+                    embedding vector(1536)
+                );
+            ''')
+        conn.commit()
+
+
+# NOTE(review): this runs on every import of the module, so importing it
+# (as scrapingsingle.py does) drops and recreates the table each time.
+create_db()