Files
old-svevijesti/pyth/db_management.py
2024-01-29 14:55:20 +01:00

250 lines
7.6 KiB
Python

import psycopg2
from psycopg2 import sql
from pgvector.psycopg2 import register_vector
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from dotenv import load_dotenv
from datetime import datetime ,timedelta
# Load variables from a .env file (if any) into the process environment
# before the DB_* settings are read.
load_dotenv()

# Connection settings; each one is None when its variable is not set.
host, port, user, password, dbname = (
    os.getenv(key)
    for key in ("DB_HOST", "DB_PORT", "DB_USER", "DB_PASSWORD", "DB_NAME")
)

# One module-wide connection, opened at import time and used by every
# helper in this file.
conn = psycopg2.connect(host=host, port=port, user=user, password=password, dbname=dbname)
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: First vector (NumPy array or any sequence of numbers).
        v2: Second vector, with the same number of elements as ``v1``.

    Returns:
        The cosine of the angle between the vectors as a float in
        ``[-1.0, 1.0]``. If either vector has zero length the similarity is
        undefined and ``0.0`` is returned instead of dividing by zero.

    Raises:
        ValueError: If the vectors do not have the same number of elements.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()

    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction; treat it as "not similar".
        return 0.0

    # Clip to absorb floating-point overshoot such as 1.0000000000000002.
    return float(np.clip(np.dot(a, b) / norm_product, -1.0, 1.0))
def parse_embedding_string(embedding_str):
    """Convert a stored embedding into a NumPy array.

    Args:
        embedding_str: Either a bracketed, comma-separated string such as
            ``"[0.1,0.2,0.3]"`` or an existing ``np.ndarray``.

    Returns:
        A 1-D ``np.ndarray`` of floats for string input (empty for ``"[]"``),
        or the array itself, unchanged, when an array is passed in.

    Raises:
        ValueError: If the argument is neither ``str`` nor ``np.ndarray``,
            or a string component cannot be parsed as a float.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str
    if not isinstance(embedding_str, str):
        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")

    # Trim surrounding whitespace first so the slice really removes the
    # enclosing brackets rather than a stray space or newline.
    inner = embedding_str.strip()[1:-1].strip()
    if not inner:
        # "[]" would otherwise reach float('') and crash.
        return np.array([], dtype=float)
    return np.array([float(num) for num in inner.split(',')])
def is_similar_data(title, text, link, embedding, threshold=0.98, category=None):
    """Check whether an article near-duplicates a stored one.

    Every stored embedding in ``vectorsvevijesti`` is compared with
    ``embedding``. On the first stored row whose cosine similarity exceeds
    ``threshold``:

    * if the links differ, the new article is inserted via ``insert_data``
      with ``similar_d`` set to the matching stored title;
    * if the links are equal, nothing is inserted.

    Args:
        title: Title of the candidate article.
        text: Body text of the candidate article.
        link: URL of the candidate article.
        embedding: Embedding vector of the candidate article.
        threshold: Similarity above which two articles count as similar.
        category: Category forwarded to ``insert_data`` when a similar
            article from a different link is found. Defaults to ``None``.

    Returns:
        ``True`` if a similar stored article was found, ``False`` otherwise.
        When ``False`` is returned this function has not inserted anything.
    """
    # Fetch everything up front so the cursor is released before any insert;
    # the context manager also closes it if the query raises.
    with conn.cursor() as cursor:
        cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
        existing_rows = cursor.fetchall()

    for existing_title, raw_embedding, existing_link in existing_rows:
        if isinstance(raw_embedding, str):
            # Embeddings can come back as text; parse them the same way
            # get_titles_links_embeddings does.
            existing_embedding = parse_embedding_string(raw_embedding)
        else:
            existing_embedding = np.array(raw_embedding).flatten()

        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if similarity > threshold:
            if link == existing_link:
                print("Same article of same source!")
                return True

            # insert_data requires a category argument; it was previously
            # omitted here, which raised TypeError on every similar match.
            insert_data(title, text, link, embedding, existing_title, category)
            print(f"Similar data found: \n #{title} \n #{existing_title}")
            print(f"Inserting: #{title}")
            return True

    print(f"Inserting: #{title}")
    return False
def get_similar():
    """Return the rows that are marked as similar to another article.

    Returns:
        A list of ``(title, link, similar_d)`` tuples for every row of
        ``vectorsvevijesti`` whose ``similar_d`` is neither ``'NO'`` nor
        ``'SOURCE'``.
    """
    query = '''SELECT title, link, similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)
        return cursor.fetchall()
def get_titles_links_embeddings():
    """Return titles, links and embeddings of all rows with ``ready = True``.

    Returns:
        A 3-tuple ``(titles, links, embeddings)`` of equally long lists in
        row order. Each embedding is passed through
        ``parse_embedding_string``.
    """
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
        rows = cursor.fetchall()

    titles, links, embeddings = [], [], []
    for title, link, raw_embedding in rows:
        titles.append(title)
        links.append(link)
        embeddings.append(parse_embedding_string(raw_embedding))
    return titles, links, embeddings
def insert_data(title, text, link, embedding, similar_d, category):
    """Insert one article into ``vectorsvevijesti`` and commit.

    The row is stored with ``time`` set to the current local time and
    ``ready`` set to ``True``.

    Args:
        title: Article title.
        text: Article body.
        link: Article URL.
        embedding: Embedding vector for the article.
        similar_d: Value for the ``similar_d`` column.
        category: Value for the ``category`` column.
    """
    c_time = datetime.now()
    # `with conn` commits on success and rolls back on failure, so a failed
    # INSERT cannot leave the shared connection in an aborted transaction;
    # the cursor is closed either way.
    with conn, conn.cursor() as cursor:
        cursor.execute(
            '''
            INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready, category)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
            ''',
            (title, text, link, embedding, similar_d, c_time, True, category),
        )
def insert_final(title, text, slug, link, source_id, category):
    """Insert a finished article into ``articles`` and commit.

    A row whose ``original_url`` already exists is skipped silently
    (``ON CONFLICT (original_url) DO NOTHING``).

    Args:
        title: Article title.
        text: Article body, stored in the ``content`` column.
        slug: Value for the ``slug`` column.
        link: Source URL, stored in the ``original_url`` column.
        source_id: Value for the ``source_id`` column.
        category: Value for the ``category`` column.
    """
    # `with conn` commits on success and rolls back on failure (for example
    # a duplicate title or slug), keeping the shared connection usable.
    with conn, conn.cursor() as cursor:
        cursor.execute(
            '''INSERT INTO articles (title, content, slug, original_url, source_id, category)
            VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (original_url) DO NOTHING;''',
            (title, text, slug, link, source_id, category),
        )
def get_data():
    """Return ``(title, text, link)`` tuples for every row of ``vectorsvevijesti``."""
    query = '''SELECT title,text,link FROM vectorsvevijesti;'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)
        return cursor.fetchall()
def get_ready_data():
    """Return all rows of ``vectorsvevijesti`` whose ``ready`` flag is true.

    Returns:
        A list of ``(title, text, link, time, similar_d, category)`` tuples.
    """
    query = '''SELECT title, text, link, time, similar_d, category FROM vectorsvevijesti WHERE ready = %s;'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        # Pass a real boolean rather than the string 'True' so the
        # parameter matches the BOOLEAN column type.
        cursor.execute(query, (True,))
        return cursor.fetchall()
def get_source_data():
    """Return all rows of ``vectorsvevijesti`` whose ``ready`` flag is false.

    Returns:
        A list of ``(title, text, link, ready)`` tuples.
    """
    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        # Pass a real boolean rather than the string 'False' so the
        # parameter matches the BOOLEAN column type.
        cursor.execute(query, (False,))
        return cursor.fetchall()
def modify_similar_data(new_value, title):
    """Set ``similar_d`` on every row with the given title and commit.

    Args:
        new_value: New value for the ``similar_d`` column.
        title: Title identifying the row(s) to update.
    """
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
    # The cursor was previously never closed. `with conn` commits on success
    # and rolls back on failure; the cursor is closed either way.
    with conn, conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
def preparing_articles(new_value, title):
    """Set the ``ready`` flag on every row with the given title and commit.

    Args:
        new_value: New value for the ``ready`` column.
        title: Title identifying the row(s) to update.
    """
    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
    # The cursor was previously never closed. `with conn` commits on success
    # and rolls back on failure; the cursor is closed either way.
    with conn, conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
def get_specific_data(title):
    """Return every ``vectorsvevijesti`` row with the given title.

    Args:
        title: Exact title to look up.

    Returns:
        A list of ``(title, text, link, similar_d, embedding, category,
        ready)`` tuples; empty when no row matches.
    """
    query = '''SELECT title, text, link, similar_d, embedding, category, ready FROM vectorsvevijesti WHERE title = %s'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query, (title,))
        return cursor.fetchall()
def get_all_links():
    """Return the set of all ``link`` values stored in ``vectorsvevijesti``."""
    query = '''SELECT link FROM vectorsvevijesti'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)
        return {row[0] for row in cursor.fetchall()}
def get_existing_titles():
    """Return the set of all ``title`` values stored in ``articles``."""
    # Only the title is used, so the unused original_url column is no
    # longer fetched.
    query = '''SELECT title FROM articles'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)
        return {row[0] for row in cursor.fetchall()}
def delete_specific(title):
    """Delete every ``vectorsvevijesti`` row with the given title and commit.

    Args:
        title: Exact title of the row(s) to delete.
    """
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
    # The DELETE was previously never committed, so it only took effect if
    # another helper happened to commit later. `with conn` commits on
    # success and rolls back on failure.
    with conn, conn.cursor() as cursor:
        cursor.execute(query, (title,))
def cleansing():
    """Delete ``vectorsvevijesti`` rows older than one day and commit.

    The cutoff is the current local time minus 24 hours, compared against
    the ``time`` column.
    """
    day_long = datetime.now() - timedelta(days=1)
    query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''
    # `with conn` commits on success and rolls back on failure; the cursor
    # is closed either way.
    with conn, conn.cursor() as cursor:
        cursor.execute(query, (day_long,))
def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists and commit."""
    query = '''DROP TABLE IF EXISTS vectorsvevijesti;'''
    # `with conn` commits on success and rolls back on failure; the cursor
    # is closed either way.
    with conn, conn.cursor() as cursor:
        cursor.execute(query)
def create_db():
    """Create the pgvector extension and the ``vectorsvevijesti`` table.

    Both statements are idempotent (``IF NOT EXISTS``). The pgvector type
    adapter is registered on the shared connection after the extension is
    created. This is the only definition of ``create_db``: an earlier
    duplicate without the ``category`` column was overridden by this one
    and has been removed.

    NOTE(review): ``CREATE TABLE IF NOT EXISTS`` does not alter an existing
    table, so a table created without ``category`` will not gain that
    column here.
    """
    # `with conn` commits on success and rolls back on failure; the cursor
    # is closed either way.
    with conn, conn.cursor() as cursor:
        cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
        # Must run after the extension exists so the vector type can be
        # looked up on this connection.
        register_vector(conn)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS vectorsvevijesti (
                id bigserial PRIMARY KEY,
                title VARCHAR,
                text VARCHAR,
                link VARCHAR,
                embedding vector(1536),
                similar_d VARCHAR,
                time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                ready BOOLEAN,
                category VARCHAR
            );
        ''')
def create_ar_table():
    """Create the ``articles`` table if it does not exist and commit.

    ``title``, ``slug`` and ``original_url`` are each declared
    ``NOT NULL UNIQUE``.
    """
    # `with conn` commits on success and rolls back on failure; the cursor
    # is closed either way.
    with conn, conn.cursor() as cursor:
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS "articles" (
                "id" bigserial PRIMARY KEY,
                "title" text NOT NULL UNIQUE,
                "content" text NOT NULL,
                "slug" text NOT NULL UNIQUE,
                "created_at" timestamptz DEFAULT NOW() NOT NULL,
                "original_url" text NOT NULL UNIQUE,
                "source_id" int NOT NULL,
                "category" VARCHAR
            );
        ''')
import psycopg2
from psycopg2 import sql
# Runs at import time: make sure the pgvector extension, the
# vectorsvevijesti table and the articles table exist before any helper
# in this module is used.
create_db()
create_ar_table()