Combine similar article
This commit is contained in:
190
pyth/vectData.py
190
pyth/vectData.py
@@ -3,12 +3,26 @@ from psycopg2 import sql
|
||||
from pgvector.psycopg2 import register_vector
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
import numpy as np
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from datetime import datetime ,timedelta
|
||||
|
||||
# Connection settings come exclusively from the environment; load_dotenv()
# is called first so the os.getenv lookups below see whatever it loaded.
# The previous hard-coded defaults (including a plaintext password) were
# overwritten unconditionally by these assignments, so they were dead code
# and are intentionally not kept in source.
load_dotenv()

host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME")

# Module-level connection, opened at import time with the settings above.
conn = psycopg2.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    dbname=dbname
)
|
||||
|
||||
def calculate_cosine_similarity(v1, v2):
|
||||
v1_normalized = v1 / np.linalg.norm(v1)
|
||||
@@ -17,7 +31,7 @@ def calculate_cosine_similarity(v1, v2):
|
||||
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
|
||||
return similarity
|
||||
|
||||
def is_similar_data(title, text, link, embedding, threshold=0.9):
|
||||
def is_similar_data(title, text, link, embedding, threshold=0.98):
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
@@ -27,25 +41,33 @@ def is_similar_data(title, text, link, embedding, threshold=0.9):
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
|
||||
cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
|
||||
existing_embeddings = cursor.fetchall()
|
||||
|
||||
for existing_embedding_tuple in existing_embeddings:
|
||||
existing_title = existing_embedding_tuple[0]
|
||||
existing_embedding = np.array(existing_embedding_tuple[1]).flatten()
|
||||
existing_link = existing_embedding_tuple[2]
|
||||
similarity = calculate_cosine_similarity(existing_embedding, embedding)
|
||||
if similarity > threshold:
|
||||
print(f"Similar data found: \n #{title} \n #{existing_title}")
|
||||
cursor.close()
|
||||
conn.close()
|
||||
return True
|
||||
if link != existing_link:
|
||||
similar_d = existing_title
|
||||
insert_data(title,text,link,embedding,similar_d)
|
||||
print(f"Similar data found: \n #{title} \n #{existing_title}")
|
||||
print(f"Inserting: #{title} \n")
|
||||
similar_d = "NO"
|
||||
cursor.close()
|
||||
return True
|
||||
else:
|
||||
print(f"Same source of same article!")
|
||||
cursor.close()
|
||||
return True
|
||||
|
||||
print(f"Inserting: #{title}")
|
||||
cursor.close()
|
||||
conn.close()
|
||||
return False
|
||||
|
||||
def insert_data(title, text, link, embedding):
|
||||
def get_similar():
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
@@ -53,17 +75,35 @@ def insert_data(title, text, link, embedding):
|
||||
password=password,
|
||||
dbname=dbname
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
|
||||
cursor.execute(query)
|
||||
similar_data = cursor.fetchall()
|
||||
cursor.close()
|
||||
return similar_data
|
||||
|
||||
|
||||
def insert_data(title, text, link, embedding, similar_d):
    """Insert one row into ``vectorsvevijesti`` and commit it.

    Args:
        title: Value for the ``title`` column.
        text: Value for the ``text`` column.
        link: Value for the ``link`` column.
        embedding: Value for the ``embedding`` column.
        similar_d: Value for the ``similar_d`` column.

    The ``time`` column is filled with ``datetime.now()`` taken at call time.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        c_time = datetime.now()
        with conn.cursor() as cursor:
            # Only the six-column statement is kept; the older four-column
            # INSERT that was still present in the body is superseded by it.
            cursor.execute('''
                INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time)
                VALUES (%s, %s, %s, %s, %s ,%s);
            ''', (title, text, link, embedding, similar_d, c_time))
        conn.commit()
    finally:
        # Release the connection even when execute/commit raises.
        conn.close()
|
||||
|
||||
def get_data():
|
||||
conn = psycopg2.connect(
|
||||
@@ -79,11 +119,110 @@ def get_data():
|
||||
cursor.execute(query)
|
||||
data = cursor.fetchall()
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
return data
|
||||
|
||||
def create_db():
|
||||
def modify_similar_data(new_value, title):
    """Update ``similar_d`` on every row whose ``title`` matches.

    Args:
        new_value: Value written to the ``similar_d`` column.
        title: Title used in the ``WHERE title = %s`` filter.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
        with conn.cursor() as cursor:
            cursor.execute(query, (new_value, title))
        conn.commit()
    finally:
        # Previously neither the cursor nor the connection was released,
        # so every call left a connection open.
        conn.close()
|
||||
|
||||
def get_specific_data(title):
    """Fetch all rows whose ``title`` equals *title*.

    Args:
        title: Title used in the ``WHERE title = %s`` filter.

    Returns:
        The ``fetchall()`` result: a list of
        ``(title, text, link, similar_d, embedding)`` tuples, empty when
        nothing matches.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s'''
        with conn.cursor() as cursor:
            cursor.execute(query, (title,))
            specific_post = cursor.fetchall()
    finally:
        # Only the cursor used to be closed; the connection itself leaked.
        conn.close()
    return specific_post
|
||||
|
||||
def get_all_links():
    """Return the set of all ``link`` values stored in ``vectorsvevijesti``.

    Returns:
        A ``set`` built from the first column of every fetched row.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        query = '''SELECT link FROM vectorsvevijesti'''
        with conn.cursor() as cursor:
            cursor.execute(query)
            db_links = {row[0] for row in cursor.fetchall()}
    finally:
        # Only the cursor used to be closed; the connection itself leaked.
        conn.close()
    return db_links
|
||||
|
||||
def delete_specific(title):
    """Delete every row of ``vectorsvevijesti`` whose ``title`` matches.

    Args:
        title: Title used in the ``WHERE title = %s`` filter.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
        with conn.cursor() as cursor:
            cursor.execute(query, (title,))
        # psycopg2 runs statements inside a transaction by default; without
        # this commit the DELETE was discarded when the connection went away.
        conn.commit()
    finally:
        # The connection was never closed before.
        conn.close()
|
||||
|
||||
def cleansing():
    """Delete rows whose ``time`` is more than one day before now.

    The cutoff is ``datetime.now() - timedelta(days=1)`` computed at call
    time; rows with ``time`` strictly less than the cutoff are removed and
    the deletion is committed.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        day_long = datetime.now() - timedelta(days=1)
        query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''
        with conn.cursor() as cursor:
            cursor.execute(query, (day_long,))
        conn.commit()
    finally:
        # Only the cursor used to be closed; the connection itself leaked.
        conn.close()
|
||||
|
||||
def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists and commit."""
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        query = '''DROP TABLE IF EXISTS vectorsvevijesti;'''
        with conn.cursor() as cursor:
            cursor.execute(query)
        conn.commit()
    finally:
        # Only the cursor used to be closed; the connection itself leaked.
        conn.close()
|
||||
|
||||
def create_db(conn):
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
@@ -97,19 +236,18 @@ def create_db():
|
||||
|
||||
register_vector(conn)
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
|
||||
|
||||
cursor.execute('''
|
||||
CREATE TABLE vectorsvevijesti (
|
||||
CREATE TABLE IF NOT EXISTS vectorsvevijesti (
|
||||
id bigserial PRIMARY KEY,
|
||||
title VARCHAR,
|
||||
text VARCHAR,
|
||||
link VARCHAR,
|
||||
embedding vector(1536)
|
||||
embedding vector(1536),
|
||||
similar_d VARCHAR,
|
||||
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
''')
|
||||
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
conn.close()
|
||||
create_db()
|
||||
create_db(conn)
|
||||
|
||||
Reference in New Issue
Block a user