Files
old-svevijesti/pyth/vectData.py
2023-12-25 12:31:55 +01:00

115 lines
2.8 KiB
Python

import os

import numpy as np
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2 import sql
from sklearn.metrics.pairwise import cosine_similarity
# PostgreSQL connection settings. Each value can be overridden through an
# environment variable; the fallbacks are the values this module used before,
# so existing deployments keep working unchanged.
# NOTE(review): the fallback password is a credential committed to source
# control -- rotate it and drop the fallback once SVW_DB_PASSWORD is deployed.
host = os.environ.get('SVW_DB_HOST', 'localhost')
port = os.environ.get('SVW_DB_PORT', '5432')
user = os.environ.get('SVW_DB_USER', 'postgres')
password = os.environ.get('SVW_DB_PASSWORD', 'salmonela pljusti 221 hamo')
dbname = os.environ.get('SVW_DB_NAME', 'vector_svw')
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1: First vector (NumPy array or any numeric array-like).
        v2: Second vector, with the same number of elements as ``v1``.

    Returns:
        The cosine of the angle between the two vectors as a ``float``.
        ``0.0`` is returned when either vector has zero norm.

    Raises:
        ValueError: If the vectors do not have the same number of elements.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0.0 or norm_b == 0.0:
        # A zero vector has no direction; dividing by its norm would give NaN.
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))
def is_similar_data(title, text, link, embedding, threshold=0.9):
    """Report whether a stored embedding is close to ``embedding``.

    Loads every ``(title, embedding)`` row from ``vectorsvevijesti`` and
    compares each stored embedding with ``embedding`` using
    ``calculate_cosine_similarity``, stopping at the first row whose
    similarity exceeds ``threshold``.

    Args:
        title: Title of the candidate item; used only in the printed messages.
        text: Unused by this function.
        link: Unused by this function.
        embedding: Embedding vector of the candidate item.
        threshold: Similarity above which the candidate counts as a duplicate.

    Returns:
        True if some stored row is more similar than ``threshold``,
        otherwise False.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        # pgvector's documented usage is to register the vector type on each
        # connection; without it the embedding column may come back as text
        # instead of a numeric array (depends on the installed pgvector
        # version -- TODO confirm against the deployed one).
        register_vector(conn)
        with conn.cursor() as cursor:
            cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
            rows = cursor.fetchall()
    finally:
        # Always release the connection, even if the query raises.
        conn.close()

    for existing_title, stored_embedding in rows:
        existing_embedding = np.array(stored_embedding).flatten()
        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if similarity > threshold:
            print(f"Similar data found: \n #{title} \n #{existing_title}")
            return True
    print(f"Inserting: #{title}")
    return False
def insert_data(title, text, link, embedding):
    """Insert one row into ``vectorsvevijesti`` and commit it.

    Args:
        title: Value for the ``title`` column.
        text: Value for the ``text`` column.
        link: Value for the ``link`` column.
        embedding: Value for the ``embedding`` column.

    Raises:
        psycopg2.Error: If connecting or inserting fails; the transaction is
            rolled back and the connection is closed before propagating.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        # ``with conn`` commits on success and rolls back on error, but does
        # not close the connection -- hence the explicit ``finally``.
        with conn, conn.cursor() as cursor:
            cursor.execute(
                '''
                INSERT INTO vectorsvevijesti (title, text, link, embedding)
                VALUES (%s, %s, %s, %s);
                ''',
                (title, text, link, embedding),
            )
    finally:
        conn.close()
def get_data():
    """Fetch the title, text and link of every row in ``vectorsvevijesti``.

    Returns:
        A list of ``(title, text, link)`` tuples as returned by
        ``cursor.fetchall()``.

    Raises:
        psycopg2.Error: If connecting or querying fails; the connection is
            closed before the error propagates.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
            return cursor.fetchall()
    finally:
        # Always release the connection, even if the query raises.
        conn.close()
def create_db(drop_existing=True):
    """Create the ``vector`` extension and the ``vectorsvevijesti`` table.

    Args:
        drop_existing: When True (the default, which matches the previous
            behaviour) an existing ``vectorsvevijesti`` table is dropped
            first, discarding all of its rows. When False an existing table
            is left untouched and only created if it is missing.

    Raises:
        psycopg2.Error: If connecting or any statement fails; the transaction
            is rolled back and the connection is closed before propagating.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        # ``with conn`` commits on success and rolls back on error, but does
        # not close the connection -- hence the explicit ``finally``.
        with conn, conn.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
            # The extension must exist before its type can be registered.
            register_vector(conn)
            if drop_existing:
                cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS vectorsvevijesti (
                    id bigserial PRIMARY KEY,
                    title VARCHAR,
                    text VARCHAR,
                    link VARCHAR,
                    embedding vector(1536)
                );
            ''')
    finally:
        conn.close()
# NOTE: this runs on every import of the module. create_db() issues
# DROP TABLE IF EXISTS vectorsvevijesti before recreating the table, so any
# rows stored by a previous run are discarded.
create_db()