Combine similar articles

This commit is contained in:
2024-01-02 15:00:07 +01:00
parent fff1c94a3d
commit ae1c1902da
15 changed files with 726 additions and 39 deletions

View File

@@ -3,12 +3,26 @@ from psycopg2 import sql
from pgvector.psycopg2 import register_vector
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from dotenv import load_dotenv
from datetime import datetime ,timedelta
# Connection settings are read from the environment (optionally populated
# from a .env file by load_dotenv) so that no credentials live in source.
# NOTE(review): a password was previously hard-coded here; it should be
# treated as leaked and rotated.
load_dotenv()
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME")

# Module-level connection opened at import time; it is handed to
# create_db(conn) later in this module.
conn = psycopg2.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    dbname=dbname,
)
def calculate_cosine_similarity(v1, v2):
v1_normalized = v1 / np.linalg.norm(v1)
@@ -17,7 +31,7 @@ def calculate_cosine_similarity(v1, v2):
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
return similarity
def is_similar_data(title, text, link, embedding, threshold=0.9):
def is_similar_data(title, text, link, embedding, threshold=0.98):
conn = psycopg2.connect(
host=host,
port=port,
@@ -27,25 +41,33 @@ def is_similar_data(title, text, link, embedding, threshold=0.9):
)
cursor = conn.cursor()
cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
existing_embeddings = cursor.fetchall()
for existing_embedding_tuple in existing_embeddings:
existing_title = existing_embedding_tuple[0]
existing_embedding = np.array(existing_embedding_tuple[1]).flatten()
existing_link = existing_embedding_tuple[2]
similarity = calculate_cosine_similarity(existing_embedding, embedding)
if similarity > threshold:
print(f"Similar data found: \n #{title} \n #{existing_title}")
cursor.close()
conn.close()
return True
if link != existing_link:
similar_d = existing_title
insert_data(title,text,link,embedding,similar_d)
print(f"Similar data found: \n #{title} \n #{existing_title}")
print(f"Inserting: #{title} \n")
similar_d = "NO"
cursor.close()
return True
else:
print(f"Same source of same article!")
cursor.close()
return True
print(f"Inserting: #{title}")
cursor.close()
conn.close()
return False
def insert_data(title, text, link, embedding):
def get_similar():
conn = psycopg2.connect(
host=host,
port=port,
@@ -53,17 +75,35 @@ def insert_data(title, text, link, embedding):
password=password,
dbname=dbname
)
cursor = conn.cursor()
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
cursor.execute(query)
similar_data = cursor.fetchall()
cursor.close()
return similar_data
def insert_data(title, text, link, embedding, similar_d):
    """Insert one article row into ``vectorsvevijesti`` and commit it.

    Args:
        title: Article title.
        text: Article body.
        link: Source URL of the article.
        embedding: Embedding stored in the ``embedding`` column.
        similar_d: Value stored in the ``similar_d`` column.

    The ``time`` column is set to the local time at the moment of the call.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        c_time = datetime.now()
        with conn.cursor() as cursor:
            cursor.execute(
                '''
                INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time)
                VALUES (%s, %s, %s, %s, %s, %s);
                ''',
                (title, text, link, embedding, similar_d, c_time),
            )
        conn.commit()
    finally:
        # Always release the connection, even when the INSERT fails.
        conn.close()
def get_data():
conn = psycopg2.connect(
@@ -79,11 +119,110 @@ def get_data():
cursor.execute(query)
data = cursor.fetchall()
cursor.close()
conn.close()
return data
def create_db():
def modify_similar_data(new_value, title):
    """Set ``similar_d`` to *new_value* on every row whose title is *title*.

    Args:
        new_value: Value written to the ``similar_d`` column.
        title: Title used to select the rows to update.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
            cursor.execute(query, (new_value, title))
        conn.commit()
    finally:
        # Previously neither the cursor nor the connection was released.
        conn.close()
def get_specific_data(title):
    """Fetch all rows of ``vectorsvevijesti`` whose title equals *title*.

    Args:
        title: Exact title to look up.

    Returns:
        A list of ``(title, text, link, similar_d, embedding)`` tuples;
        empty when no row matches.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s'''
            cursor.execute(query, (title,))
            return cursor.fetchall()
    finally:
        # Previously the connection was left open after every lookup.
        conn.close()
def get_all_links():
    """Return the set of all ``link`` values stored in ``vectorsvevijesti``.

    Returns:
        A set containing the first column of every fetched row.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            query = '''SELECT link FROM vectorsvevijesti'''
            cursor.execute(query)
            return {row[0] for row in cursor.fetchall()}
    finally:
        # Previously the connection was left open after every call.
        conn.close()
def delete_specific(title):
    """Delete every row of ``vectorsvevijesti`` whose title equals *title*.

    Args:
        title: Exact title of the rows to remove.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
            cursor.execute(query, (title,))
        # psycopg2 does not autocommit by default: without this commit the
        # DELETE is discarded when the connection goes away.
        conn.commit()
    finally:
        conn.close()
def cleansing():
    """Delete rows of ``vectorsvevijesti`` whose ``time`` is over a day old.

    The cutoff is the local time at the moment of the call minus one day.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        day_long = datetime.now() - timedelta(days=1)
        with conn.cursor() as cursor:
            query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''
            cursor.execute(query, (day_long,))
        conn.commit()
    finally:
        # Previously the connection was left open after the cleanup.
        conn.close()
def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists and commit."""
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            query = '''DROP TABLE IF EXISTS vectorsvevijesti;'''
            cursor.execute(query)
        conn.commit()
    finally:
        # Previously the connection was left open after the drop.
        conn.close()
def create_db(conn):
conn = psycopg2.connect(
host=host,
port=port,
@@ -97,19 +236,18 @@ def create_db():
register_vector(conn)
cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
cursor.execute('''
CREATE TABLE vectorsvevijesti (
CREATE TABLE IF NOT EXISTS vectorsvevijesti (
id bigserial PRIMARY KEY,
title VARCHAR,
text VARCHAR,
link VARCHAR,
embedding vector(1536)
embedding vector(1536),
similar_d VARCHAR,
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
''')
conn.commit()
cursor.close()
conn.close()
create_db()
create_db(conn)