"""Storage helpers for the ``vectorsvevijesti`` and ``articles`` tables.

Importing this module loads environment variables, opens one shared
PostgreSQL connection (``conn``) and makes sure both tables exist.
"""

import os
from datetime import datetime, timedelta

import numpy as np
import psycopg2
from dotenv import load_dotenv
from pgvector.psycopg2 import register_vector
from psycopg2 import sql
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()

host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME")

# Single module-wide connection shared by every helper below.
conn = psycopg2.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    dbname=dbname,
)


def _fetch_all(query, params=None):
    """Run a read query on the shared connection and return every row."""
    with conn.cursor() as cursor:
        cursor.execute(query, params)
        return cursor.fetchall()


def _execute_write(query, params=None):
    """Run a data-modifying statement and commit it.

    ``with conn`` commits on success and rolls back if the statement raises,
    so a failed write does not leave the shared connection stuck in an
    aborted transaction. The exception itself still propagates.
    """
    with conn:
        with conn.cursor() as cursor:
            cursor.execute(query, params)


def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    A vector with zero norm has no direction; 0.0 is returned for it instead
    of dividing by zero and feeding NaN into scikit-learn.
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    similarity = cosine_similarity([v1 / norm1], [v2 / norm2])[0][0]
    return similarity


def parse_embedding_string(embedding_str):
    """Convert a bracketed, comma-separated string such as ``"[1,2,3]"`` to an array.

    ``np.ndarray`` input is returned unchanged.

    Raises:
        ValueError: if the argument is neither ``str`` nor ``np.ndarray``,
            or if a component of the string is not a valid float.
    """
    if isinstance(embedding_str, str):
        inner = embedding_str[1:-1].strip()
        if not inner:
            # "[]" would otherwise fail on float('').
            return np.array([], dtype=float)
        numbers = [float(num) for num in inner.split(',')]
        return np.array(numbers)
    elif isinstance(embedding_str, np.ndarray):
        return embedding_str
    else:
        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")


def is_similar_data(title, text, link, embedding, threshold=0.98, category=None):
    """Check whether a stored article is a near-duplicate of the given one.

    Every stored embedding is compared against ``embedding``. On the first
    row whose cosine similarity exceeds ``threshold``:

    * if the stored link differs from ``link``, the new article is inserted
      with ``similar_d`` set to the matching stored title;
    * if the link is the same, nothing is inserted.

    Args:
        title, text, link, embedding: the candidate article.
        threshold: similarity above which two articles count as similar.
        category: stored with the article when it is inserted here. Optional
            so existing five-argument callers keep working.

    Returns:
        True if a similar row was found (the caller must not insert again),
        False otherwise.
    """
    # Fetch first so no cursor stays open while insert_data() commits.
    rows = _fetch_all('SELECT title,embedding,link FROM vectorsvevijesti;')

    for existing_title, raw_embedding, existing_link in rows:
        if raw_embedding is None:
            # A row without an embedding cannot be compared.
            continue
        if isinstance(raw_embedding, str):
            # Text form is returned when the vector adapter is not registered.
            existing_embedding = parse_embedding_string(raw_embedding)
        else:
            existing_embedding = np.array(raw_embedding).flatten()

        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if not similarity > threshold:
            continue

        if link == existing_link:
            print("Same article of same source!")
            return True

        insert_data(title, text, link, embedding, existing_title, category)
        print(f"Similar data found: \n #{title} \n #{existing_title}")
        print(f"Inserting: #{title}")
        return True

    print(f"Inserting: #{title}")
    return False


def get_similar():
    """Return ``(title, link, similar_d)`` rows whose ``similar_d`` is neither 'NO' nor 'SOURCE'."""
    query = '''SELECT title, link, similar_d
               FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
    return _fetch_all(query)


def get_titles_links_embeddings():
    """Return three parallel lists (titles, links, embeddings) for rows with ``ready = True``."""
    data = _fetch_all('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
    titles = [row[0] for row in data]
    links = [row[1] for row in data]
    embeddings = [parse_embedding_string(row[2]) for row in data]
    return titles, links, embeddings


def insert_data(title, text, link, embedding, similar_d, category):
    """Insert one article into ``vectorsvevijesti`` with ``ready = True`` and the current time."""
    c_time = datetime.now()
    _execute_write(
        '''
        INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready, category)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
        ''',
        (title, text, link, embedding, similar_d, c_time, True, category),
    )


def insert_final(title, text, slug, link, source_id, category):
    """Insert one row into ``articles``; a conflict on ``original_url`` is silently skipped."""
    _execute_write(
        '''INSERT INTO articles (title, content, slug, original_url, source_id, category)
           VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (original_url) DO NOTHING;''',
        (title, text, slug, link, source_id, category),
    )


def get_data():
    """Return ``(title, text, link)`` for every row of ``vectorsvevijesti``."""
    return _fetch_all('''SELECT title,text,link FROM vectorsvevijesti;''')


def get_ready_data():
    """Return ``(title, text, link, time, similar_d, category)`` for rows with ``ready`` true."""
    query = '''SELECT title, text, link, time, similar_d, category
               FROM vectorsvevijesti WHERE ready = %s;'''
    return _fetch_all(query, (True,))


def get_source_data():
    """Return ``(title, text, link, ready)`` for rows with ``ready`` false."""
    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
    return _fetch_all(query, (False,))


def modify_similar_data(new_value, title):
    """Set ``similar_d`` to ``new_value`` on every row with the given title."""
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s'''
    _execute_write(query, (new_value, title))


def preparing_articles(new_value, title):
    """Set ``ready`` to ``new_value`` on every row with the given title."""
    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s'''
    _execute_write(query, (new_value, title))


def get_specific_data(title):
    """Return all rows with the given title.

    Each row is ``(title, text, link, similar_d, embedding, category, ready)``.
    """
    query = '''SELECT title, text, link, similar_d, embedding, category, ready
               FROM vectorsvevijesti WHERE title = %s'''
    return _fetch_all(query, (title,))


def get_all_links():
    """Return the set of all links stored in ``vectorsvevijesti``."""
    rows = _fetch_all('''SELECT link FROM vectorsvevijesti''')
    return {row[0] for row in rows}


def get_existing_titles():
    """Return the set of titles stored in ``articles``."""
    rows = _fetch_all('''SELECT title, original_url FROM articles''')
    return {row[0] for row in rows}


def delete_specific(title):
    """Delete every ``vectorsvevijesti`` row with the given title and commit the deletion."""
    _execute_write('''DELETE FROM vectorsvevijesti WHERE title = %s''', (title,))


def cleansing():
    """Delete ``vectorsvevijesti`` rows whose ``time`` is more than one day old."""
    day_long = datetime.now() - timedelta(days=1)
    _execute_write('''DELETE FROM vectorsvevijesti WHERE time < %s''', (day_long,))


def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists."""
    _execute_write('''DROP TABLE IF EXISTS vectorsvevijesti;''')


def create_db():
    """Enable the ``vector`` extension, register its adapter, and create ``vectorsvevijesti``."""
    with conn:
        with conn.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
            # Registered after the extension is created, matching the original order.
            register_vector(conn)
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS vectorsvevijesti (
                    id bigserial PRIMARY KEY,
                    title VARCHAR,
                    text VARCHAR,
                    link VARCHAR,
                    embedding vector(1536),
                    similar_d VARCHAR,
                    time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    ready BOOLEAN,
                    category VARCHAR
                );
            ''')


def create_ar_table():
    """Create the ``articles`` table if it does not already exist."""
    _execute_write('''
        CREATE TABLE IF NOT EXISTS "articles" (
            "id" bigserial PRIMARY KEY,
            "title" text NOT NULL UNIQUE,
            "content" text NOT NULL,
            "slug" text NOT NULL UNIQUE,
            "created_at" timestamptz DEFAULT NOW() NOT NULL,
            "original_url" text NOT NULL UNIQUE,
            "source_id" int NOT NULL,
            "category" VARCHAR
        );
    ''')


# Ensure the schema exists as soon as the module is imported.
create_db()
create_ar_table()