"""PostgreSQL/pgvector storage helpers for the ``vectorsvevijesti`` table.

Connection settings are read from the environment (after ``load_dotenv()``):
``DB_HOST``, ``DB_PORT``, ``DB_USER``, ``DB_PASSWORD`` and ``DB_NAME``.

Importing this module has side effects: it opens the module-level ``conn``
connection and calls ``create_db(conn)`` so the ``vector`` extension and the
table exist before any other helper is used.
"""
import os
from contextlib import contextmanager
from datetime import datetime, timedelta

import numpy as np
import psycopg2
from dotenv import load_dotenv
from pgvector.psycopg2 import register_vector
from psycopg2 import sql
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()

host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME")


def _connect():
    """Open a new connection using the module-level connection settings."""
    return psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )


@contextmanager
def _session(*, vectors=False, commit=False):
    """Yield a short-lived connection that is always closed afterwards.

    Args:
        vectors: Register the pgvector type on the new connection. This
            requires the ``vector`` extension to exist already, which
            ``create_db`` takes care of at import time.
        commit: Commit the transaction when the ``with`` body finishes without
            raising.

    If the body raises, the connection is closed without committing, which
    discards the open transaction.
    """
    connection = _connect()
    try:
        if vectors:
            register_vector(connection)
        yield connection
        if commit:
            connection.commit()
    finally:
        connection.close()


def _fetch_all(query, params=None, *, vectors=False):
    """Run a read-only query on a fresh connection and return every row."""
    with _session(vectors=vectors) as connection:
        with connection.cursor() as cursor:
            cursor.execute(query, params)
            return cursor.fetchall()


def _execute_and_commit(query, params=None, *, vectors=False):
    """Run a data-modifying statement on a fresh connection and commit it."""
    with _session(vectors=vectors, commit=True) as connection:
        with connection.cursor() as cursor:
            cursor.execute(query, params)


# Kept for backward compatibility: this connection is created at import time
# and handed to create_db() at the bottom of the module. It stays open for the
# lifetime of the process; the helpers below use their own short-lived
# connections instead.
conn = _connect()


def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    ``cosine_similarity`` normalises its inputs itself, so no manual division
    by the norm is done here. That division used to turn an all-zero vector
    into NaNs, which scikit-learn then rejected.
    """
    first = np.asarray(v1).reshape(1, -1)
    second = np.asarray(v2).reshape(1, -1)
    return cosine_similarity(first, second)[0][0]


def is_similar_data(title, text, link, embedding, threshold=0.98):
    """Check whether a stored article is near-identical to the given one.

    Every stored embedding is compared against ``embedding``. On the first row
    whose cosine similarity exceeds ``threshold``:

    * if the stored link differs from ``link``, the new article is inserted
      with ``similar_d`` set to the matching stored title;
    * if the links are equal, nothing is inserted.

    Args:
        title: Title of the candidate article.
        text: Body of the candidate article (only used when inserting).
        link: Link of the candidate article.
        embedding: Embedding vector of the candidate article.
        threshold: Similarity strictly above this value counts as a match.

    Returns:
        True if a match was found (the article was either inserted here or
        skipped as the same source), False if no stored row matched. On False
        nothing has been inserted by this function.
    """
    # Fetch everything first so the read connection is closed before
    # insert_data() opens its own.
    rows = _fetch_all(
        'SELECT title,embedding,link FROM vectorsvevijesti;',
        vectors=True,
    )

    for existing_title, stored_embedding, existing_link in rows:
        if stored_embedding is None:
            # A row without an embedding cannot be compared.
            continue
        existing_embedding = np.array(stored_embedding).flatten()
        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if not similarity > threshold:
            continue

        if link == existing_link:
            print("Same source of same article!")
            return True

        insert_data(title, text, link, embedding, existing_title)
        print(f"Similar data found: \n #{title} \n #{existing_title}")
        print(f"Inserting: #{title} \n")
        return True

    print(f"Inserting: #{title}")
    return False


def get_similar():
    """Return ``(title, similar_d)`` rows whose ``similar_d`` is neither 'NO' nor 'SOURCE'."""
    query = '''SELECT title,similar_d FROM vectorsvevijesti
               WHERE similar_d NOT IN ('NO', 'SOURCE')'''
    return _fetch_all(query)


def insert_data(title, text, link, embedding, similar_d):
    """Insert one article row, stamping ``time`` with the current local time."""
    c_time = datetime.now()
    query = '''
        INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time)
        VALUES (%s, %s, %s, %s, %s ,%s);
    '''
    _execute_and_commit(
        query,
        (title, text, link, embedding, similar_d, c_time),
        vectors=True,
    )


def get_data():
    """Return ``(title, text, link)`` for every stored article."""
    return _fetch_all('''SELECT title,text,link FROM vectorsvevijesti;''')


def modify_similar_data(new_value, title):
    """Set ``similar_d`` to ``new_value`` on every row with the given title."""
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
    _execute_and_commit(query, (new_value, title))


def get_specific_data(title):
    """Return ``(title, text, link, similar_d, embedding)`` rows matching ``title``."""
    query = '''SELECT title, text, link, similar_d, embedding
               FROM vectorsvevijesti WHERE title = %s'''
    return _fetch_all(query, (title,), vectors=True)


def get_all_links():
    """Return the set of all stored article links."""
    rows = _fetch_all('''SELECT link FROM vectorsvevijesti''')
    return {row[0] for row in rows}


def delete_specific(title):
    """Delete every row with the given title.

    The statement is committed; without the commit the DELETE was discarded
    together with the connection.
    """
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
    _execute_and_commit(query, (title,))


def cleansing():
    """Delete rows whose ``time`` is more than one day before the current local time."""
    day_long = datetime.now() - timedelta(days=1)
    query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''
    _execute_and_commit(query, (day_long,))


def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists."""
    _execute_and_commit('''DROP TABLE IF EXISTS vectorsvevijesti;''')


def create_db(conn):
    """Ensure the ``vector`` extension and the ``vectorsvevijesti`` table exist.

    Args:
        conn: An open psycopg2 connection to run the DDL on. It is committed
            but left open for the caller. If it is ``None`` or already closed,
            a temporary connection is opened and closed here instead, so
            callers that relied on the argument being ignored keep working.
    """
    owns_connection = conn is None or bool(getattr(conn, "closed", 0))
    if owns_connection:
        conn = _connect()
    try:
        with conn.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
            # The type can only be registered once the extension exists.
            register_vector(conn)
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS vectorsvevijesti (
                    id bigserial PRIMARY KEY,
                    title VARCHAR,
                    text VARCHAR,
                    link VARCHAR,
                    embedding vector(1536),
                    similar_d VARCHAR,
                    time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
            ''')
        conn.commit()
    finally:
        if owns_connection:
            conn.close()


create_db(conn)