# Listing metadata: 115 lines, 2.8 KiB, Python.
import os

import numpy as np
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2 import sql
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
# Connection settings for the PostgreSQL database used by every function in
# this module. Each value may be overridden through the corresponding libpq
# environment variable; the literals are the previously hard-coded values,
# kept as fallbacks so existing setups keep working unchanged.
# NOTE(review): the fallback password is a credential committed to source.
# Rotate it and supply PGPASSWORD from the environment instead.
host = os.environ.get('PGHOST', 'localhost')
port = os.environ.get('PGPORT', '5432')
user = os.environ.get('PGUSER', 'postgres')
password = os.environ.get('PGPASSWORD', 'salmonela pljusti 221 hamo')
dbname = os.environ.get('PGDATABASE', 'vector_svw')
|
|
|
|
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a float.

    Both inputs are converted to flat float arrays, so lists, tuples and
    NumPy arrays are accepted.

    Args:
        v1: First vector (any array-like of numbers).
        v2: Second vector; must have the same number of elements as *v1*.

    Returns:
        The cosine of the angle between the vectors, in ``[-1.0, 1.0]`` up to
        floating-point rounding. If either vector has zero length (norm 0)
        the similarity is undefined and ``0.0`` is returned.

    Raises:
        ValueError: If the vectors have different numbers of elements.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()

    # Dividing by a zero norm would yield NaN; treat a zero vector as
    # "not similar to anything" instead.
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0.0:
        return 0.0

    return float(np.dot(a, b) / denominator)
|
|
|
|
def is_similar_data(title, text, link, embedding, threshold=0.9):
    """Report whether any stored embedding is close to *embedding*.

    Loads every ``(title, embedding)`` row from ``vectorsvevijesti`` and
    compares each stored embedding with *embedding* using
    ``calculate_cosine_similarity``.

    Args:
        title: Title of the candidate item; only used in the printed messages.
        text: Unused; accepted to keep the signature stable.
        link: Unused; accepted to keep the signature stable.
        embedding: Embedding vector of the candidate item.
        threshold: A stored row counts as similar when its cosine similarity
            is strictly greater than this value.

    Returns:
        ``True`` as soon as one stored row exceeds *threshold*, otherwise
        ``False``.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        # Without the pgvector adapter the ``vector`` column is returned in
        # its text form, which cannot be used for the numeric comparison.
        register_vector(conn)
        with conn.cursor() as cursor:
            cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
            rows = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    for existing_title, stored_embedding in rows:
        if stored_embedding is None:
            # A row without an embedding cannot be compared.
            continue
        existing_embedding = np.array(stored_embedding).flatten()
        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if similarity > threshold:
            print(f"Similar data found: \n #{title} \n #{existing_title}")
            return True

    print(f"Inserting: #{title}")
    return False
|
|
|
|
def insert_data(title, text, link, embedding):
    """Insert one row into ``vectorsvevijesti`` and commit it.

    Args:
        title: Value for the ``title`` column.
        text: Value for the ``text`` column.
        link: Value for the ``link`` column.
        embedding: Value for the ``embedding`` column.

    The connection is always closed. If the insert or the commit raises, the
    exception propagates and nothing is committed.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        # Lets a NumPy array be passed as the embedding parameter.
        register_vector(conn)
        with conn.cursor() as cursor:
            cursor.execute('''
                INSERT INTO vectorsvevijesti (title, text, link, embedding)
                VALUES (%s, %s, %s, %s);
            ''', (title, text, link, embedding))
        conn.commit()
    finally:
        # Closing without a commit discards the pending transaction.
        conn.close()
|
|
|
|
def get_data():
    """Return every ``(title, text, link)`` row from ``vectorsvevijesti``.

    Returns:
        The list of row tuples produced by ``cursor.fetchall()``.

    The connection is closed before returning, including when the query
    raises.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
            return cursor.fetchall()
    finally:
        conn.close()
|
|
|
|
def create_db():
    """Create the ``vector`` extension and (re)create ``vectorsvevijesti``.

    WARNING: any existing ``vectorsvevijesti`` table is dropped first, so all
    stored rows are lost.

    The connection is always closed. If any statement raises, the exception
    propagates and nothing is committed.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")

            # Kept from the original flow; the DDL below neither sends nor
            # reads vector values on this connection.
            register_vector(conn)

            cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
            cursor.execute('''
                CREATE TABLE vectorsvevijesti (
                    id bigserial PRIMARY KEY,
                    title VARCHAR,
                    text VARCHAR,
                    link VARCHAR,
                    embedding vector(1536)
                );
            ''')
        conn.commit()
    finally:
        conn.close()
|
|
# NOTE(review): this call runs on every import of the module, and create_db()
# drops and recreates the table. Confirm that is intended before importing
# this module from other code.
create_db()