254 lines
6.0 KiB
Python
254 lines
6.0 KiB
Python
import psycopg2
|
|
from psycopg2 import sql
|
|
from pgvector.psycopg2 import register_vector
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
import numpy as np
|
|
import os
|
|
from dotenv import load_dotenv
|
|
from datetime import datetime ,timedelta
|
|
|
|
|
|
# Load settings from a .env file (if present) into the process environment.
load_dotenv()

# Connection settings shared by every helper in this module.
host, port, user, password, dbname = (
    os.getenv(env_key)
    for env_key in ("DB_HOST", "DB_PORT", "DB_USER", "DB_PASSWORD", "DB_NAME")
)

# Connection opened once at import time using the settings above.
conn = psycopg2.connect(host=host, port=port, user=user, password=password, dbname=dbname)
|
|
|
|
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: First vector (NumPy array or any sequence of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        The cosine of the angle between the vectors as a NumPy float.
        ``0.0`` is returned when either vector has zero length, because the
        similarity is undefined there and dividing by the zero norm would
        produce NaN.

    Raises:
        ValueError: If the vectors have different lengths.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()

    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Guard against division by zero for all-zero vectors.
        return np.float64(0.0)

    # dot(a, b) / (|a| * |b|) -- no need to normalise each vector first.
    return np.dot(a, b) / norm_product
|
|
|
|
def is_similar_data(title, text, link, embedding, threshold=0.98):
    """Check whether an article is a near-duplicate of a stored one.

    Every stored embedding is compared with ``embedding``. The first stored
    row whose cosine similarity exceeds ``threshold`` decides the outcome:

    * different link: the new article is inserted through ``insert_data``
      with ``similar_d`` set to the matching row's title, and True is
      returned;
    * same link: nothing is inserted and True is returned.

    If no stored row exceeds the threshold, nothing is inserted here and
    False is returned.

    Args:
        title: Title of the candidate article.
        text: Text of the candidate article.
        link: Link of the candidate article.
        embedding: Embedding vector of the candidate article.
        threshold: Similarity above which two articles count as similar.

    Returns:
        True if a stored row exceeded the threshold, otherwise False.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        # Depending on the pgvector version the vector type caster is
        # registered per connection, so register it on this fresh one to get
        # numeric arrays back instead of text.
        register_vector(conn)
        with conn.cursor() as cursor:
            cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
            stored_rows = cursor.fetchall()
    finally:
        # Always release the connection; it used to be leaked on every call.
        conn.close()

    # Convert once so plain sequences work in the similarity computation.
    query_vector = np.asarray(embedding)

    for existing_title, stored_vector, existing_link in stored_rows:
        existing_embedding = np.array(stored_vector).flatten()
        similarity = calculate_cosine_similarity(existing_embedding, query_vector)
        if similarity > threshold:
            if link != existing_link:
                # Same story from another source: store it, pointing at the
                # article it resembles.
                insert_data(title, text, link, embedding, existing_title)
                print(f"Similar data found: \n #{title} \n #{existing_title}")
                print(f"Inserting: #{title} \n")
                return True

            print("Same source of same article!")
            return True

    print(f"Inserting: #{title}")
    return False
|
|
|
|
def get_similar():
    """Return ``(title, similar_d)`` rows that reference a similar article.

    Rows whose ``similar_d`` is ``'NO'`` or ``'SOURCE'`` are excluded.

    Returns:
        A list of ``(title, similar_d)`` tuples.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
            )
            return cursor.fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()
|
|
|
|
|
|
def insert_data(title, text, link, embedding, similar_d):
    """Insert one article row into ``vectorsvevijesti`` and commit it.

    The ``time`` column is set to the current local time.

    Args:
        title: Article title.
        text: Article text.
        link: Article link.
        embedding: Vector stored in the ``embedding`` column.
        similar_d: Value for the ``similar_d`` column. Elsewhere in this file
            it holds the title of a similar article, and ``'NO'`` /
            ``'SOURCE'`` are filtered out by ``get_similar``.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                '''
                INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time)
                VALUES (%s, %s, %s, %s, %s ,%s);
                ''',
                (title, text, link, embedding, similar_d, datetime.now()),
            )
        conn.commit()
    finally:
        # Closing without a commit discards a failed insert and frees the
        # connection, which used to be leaked.
        conn.close()
|
|
|
|
def get_data():
    """Return every stored article as ``(title, text, link)`` tuples.

    Returns:
        A list with one ``(title, text, link)`` tuple per table row.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
            return cursor.fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()
|
|
|
|
def modify_similar_data(new_value, title):
    """Set ``similar_d`` to ``new_value`` on every row with the given title.

    Args:
        new_value: New content of the ``similar_d`` column.
        title: Title identifying the row(s) to update.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''',
                (new_value, title),
            )
        conn.commit()
    finally:
        # Neither the cursor nor the connection used to be closed.
        conn.close()
|
|
|
|
def get_specific_data(title):
    """Return all rows whose title equals ``title``.

    Args:
        title: Exact title to look up.

    Returns:
        A list of ``(title, text, link, similar_d, embedding)`` tuples; empty
        when no row matches. NOTE(review): the embedding value is returned in
        whatever form psycopg2 decodes it on this connection -- confirm what
        callers expect.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s''',
                (title,),
            )
            return cursor.fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()
|
|
|
|
def get_all_links():
    """Return the set of all links currently stored in the table.

    Returns:
        A set containing the ``link`` value of every row.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''SELECT link FROM vectorsvevijesti''')
            # Each row is a 1-tuple; keep only the link itself.
            return {row[0] for row in cursor.fetchall()}
    finally:
        # Release the connection even if the query fails.
        conn.close()
|
|
|
|
def delete_specific(title):
    """Delete every row whose title equals ``title`` and commit the change.

    Args:
        title: Exact title of the row(s) to remove.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''DELETE FROM vectorsvevijesti WHERE title = %s''', (title,))
        # psycopg2 does not autocommit by default; without this commit the
        # DELETE was discarded when the connection went away.
        conn.commit()
    finally:
        conn.close()
|
|
|
|
def cleansing():
    """Delete rows whose ``time`` is more than one day in the past.

    The cutoff is computed from the current local time.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        cutoff = datetime.now() - timedelta(days=1)
        with conn.cursor() as cursor:
            cursor.execute('''DELETE FROM vectorsvevijesti WHERE time < %s''', (cutoff,))
        conn.commit()
    finally:
        # Release the connection even if the delete fails.
        conn.close()
|
|
|
|
def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists and commit."""
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''DROP TABLE IF EXISTS vectorsvevijesti;''')
        conn.commit()
    finally:
        # Release the connection even if the statement fails.
        conn.close()
|
|
|
|
def create_db(conn=None):
    """Ensure the pgvector extension and the ``vectorsvevijesti`` table exist.

    The pgvector type is also registered on the connection that is used.

    Args:
        conn: Open psycopg2 connection to run the statements on. It is left
            open for the caller. When omitted, a temporary connection is
            opened from the module settings and closed afterwards.
            (The argument used to be ignored and replaced by a new
            connection that was never closed.)
    """
    owns_connection = conn is None
    if owns_connection:
        conn = psycopg2.connect(
            host=host,
            port=port,
            user=user,
            password=password,
            dbname=dbname,
        )

    try:
        with conn.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")

            # Must run after the extension exists, otherwise the vector type
            # cannot be looked up.
            register_vector(conn)

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS vectorsvevijesti (
                    id bigserial PRIMARY KEY,
                    title VARCHAR,
                    text VARCHAR,
                    link VARCHAR,
                    embedding vector(1536),
                    similar_d VARCHAR,
                    time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
            ''')
        conn.commit()
    except Exception:
        # Do not leave the caller's connection stuck in a failed transaction.
        conn.rollback()
        raise
    finally:
        if owns_connection:
            conn.close()
|
|
# Runs at module load: issues CREATE EXTENSION / CREATE TABLE IF NOT EXISTS
# so the table is available before any helper is called.
create_db(conn)
|