# Listing metadata: 250 lines, 7.6 KiB, Python
import psycopg2
|
|
from psycopg2 import sql
|
|
from pgvector.psycopg2 import register_vector
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
import numpy as np
|
|
import os
|
|
from dotenv import load_dotenv
|
|
from datetime import datetime ,timedelta
|
|
|
|
# Load variables through python-dotenv before reading the DB_* settings.
load_dotenv()

# Connection settings read from the environment; each is None when unset.
host, port, user, password, dbname = (
    os.getenv(key)
    for key in ("DB_HOST", "DB_PORT", "DB_USER", "DB_PASSWORD", "DB_NAME")
)

# One module-level connection shared by every helper in this file.
conn = psycopg2.connect(
    host=host, port=port, user=user, password=password, dbname=dbname
)
|
|
|
|
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        v1: First vector (1-D array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        The cosine of the angle between the vectors as a float.  When
        either vector has zero norm the similarity is undefined; 0.0 is
        returned instead of dividing by zero and propagating NaN.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)

    # Dividing the dot product by both norms is the whole computation, so
    # the vectors do not need to be normalised separately beforehand.
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)
|
|
|
|
def parse_embedding_string(embedding_str):
    """Convert a stored embedding into a NumPy array.

    Args:
        embedding_str: Either a bracketed, comma-separated string such as
            ``"[0.1,0.2,0.3]"``, or an ``np.ndarray`` (returned unchanged).

    Returns:
        An ``np.ndarray`` of floats; empty when the string holds no
        elements (e.g. ``"[]"``).

    Raises:
        ValueError: If the argument is neither ``str`` nor ``np.ndarray``,
            or if an element of the string cannot be parsed as a float.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str
    if not isinstance(embedding_str, str):
        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")

    # Strip padding first so the slice removes the brackets themselves,
    # then strip again to detect an empty vector.
    inner = embedding_str.strip()[1:-1].strip()
    if not inner:
        return np.array([], dtype=float)
    return np.array([float(num) for num in inner.split(',')])
|
|
|
|
def is_similar_data(title, text, link, embedding, threshold=0.98, category=None):
    """Compare a new article with every stored embedding.

    All rows of ``vectorsvevijesti`` are scanned in the order returned by
    the database.  The first row whose cosine similarity with
    ``embedding`` is greater than ``threshold`` decides the outcome:

    * its link differs from ``link``: the new article is inserted via
      ``insert_data`` with ``similar_d`` set to that row's title;
    * its link equals ``link``: nothing is inserted.

    Args:
        title: Title of the new article.
        text: Body text of the new article.
        link: URL of the new article.
        embedding: Embedding vector of the new article.
        threshold: Similarity above which two articles count as similar.
        category: Category passed on to ``insert_data`` when the article
            is stored as a similar item.  Optional so that existing
            five-argument callers keep working.

    Returns:
        True when a stored row exceeded the threshold, False otherwise.
        In the False case nothing is inserted here -- presumably the
        caller performs the insert (verify against the caller).
    """
    with conn.cursor() as cursor:
        cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
        rows = cursor.fetchall()

    for existing_title, raw_embedding, existing_link in rows:
        existing_embedding = np.array(raw_embedding).flatten()
        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if similarity > threshold:
            if link != existing_link:
                # insert_data requires the category argument; omitting it
                # raised TypeError on this path.
                insert_data(title, text, link, embedding, existing_title, category)
                print(f"Similar data found: \n #{title} \n #{existing_title}")
                print(f"Inserting: #{title}")
            else:
                print("Same article of same source!")
            return True

    print(f"Inserting: #{title}")
    return False
|
|
|
|
def get_similar():
    """Fetch rows whose ``similar_d`` is neither ``'NO'`` nor ``'SOURCE'``.

    Returns:
        The ``(title, link, similar_d)`` rows returned by ``fetchall()``.
    """
    cur = conn.cursor()
    cur.execute(
        '''SELECT title, link, similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
    )
    rows = cur.fetchall()
    cur.close()
    return rows
|
|
|
|
def get_titles_links_embeddings():
    """Load title, link and embedding of every row with ``ready = True``.

    Returns:
        A tuple ``(titles, links, embeddings)`` of three parallel lists;
        each embedding has been passed through ``parse_embedding_string``.
    """
    cur = conn.cursor()
    cur.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
    rows = cur.fetchall()
    cur.close()

    titles, links, embeddings = [], [], []
    for row_title, row_link, raw_embedding in rows:
        titles.append(row_title)
        links.append(row_link)
        embeddings.append(parse_embedding_string(raw_embedding))

    return titles, links, embeddings
|
|
|
|
|
|
def insert_data(title, text, link, embedding, similar_d, category):
    """Insert one row into ``vectorsvevijesti`` and commit it.

    The row is stamped with ``datetime.now()`` and stored with ``ready``
    set to True.

    Args:
        title: Article title.
        text: Article body.
        link: Article URL.
        embedding: Embedding vector for the ``embedding`` column.
        similar_d: Value for the ``similar_d`` column.
        category: Value for the ``category`` column.

    Raises:
        psycopg2.Error: If the statement fails.  The transaction is rolled
            back first so the shared connection stays usable.
    """
    c_time = datetime.now()
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                '''
                INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready, category)
                VALUES (%s, %s, %s, %s, %s ,%s ,%s ,%s);
                ''',
                (title, text, link, embedding, similar_d, c_time, True, category),
            )
        conn.commit()
    except psycopg2.Error:
        # Without a rollback the connection remains in an aborted
        # transaction and every later statement on it fails.
        conn.rollback()
        raise
|
|
|
|
def insert_final(title, text, slug, link, source_id, category):
    """Insert one row into ``articles`` and commit it.

    A row whose ``original_url`` already exists is skipped silently
    (``ON CONFLICT (original_url) DO NOTHING``).  The conflict clause
    covers only ``original_url``; ``create_ar_table`` also declares
    ``title`` and ``slug`` UNIQUE, so a duplicate of either still raises.

    Args:
        title: Article title.
        text: Article body, stored in the ``content`` column.
        slug: Value for the ``slug`` column.
        link: Source URL, stored in the ``original_url`` column.
        source_id: Value for the ``source_id`` column.
        category: Value for the ``category`` column.

    Raises:
        psycopg2.Error: If the statement fails.  The transaction is rolled
            back first so the shared connection stays usable.
    """
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                '''INSERT INTO articles (title, content, slug, original_url, source_id, category)
                VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (original_url) DO NOTHING;''',
                (title, text, slug, link, source_id, category),
            )
        conn.commit()
    except psycopg2.Error:
        # Leave the shared connection out of the aborted-transaction state.
        conn.rollback()
        raise
|
|
|
|
def get_data():
    """Fetch ``(title, text, link)`` for every row of ``vectorsvevijesti``.

    Returns:
        The rows returned by ``fetchall()``.
    """
    cur = conn.cursor()
    cur.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
    rows = cur.fetchall()
    cur.close()
    return rows
|
|
|
|
def get_ready_data():
    """Fetch the rows of ``vectorsvevijesti`` whose ``ready`` flag is true.

    Returns:
        ``(title, text, link, time, similar_d, category)`` rows as returned
        by ``fetchall()``.
    """
    cur = conn.cursor()
    # The flag is passed as the string 'True' and bound as a query parameter.
    cur.execute(
        '''SELECT title, text, link, time, similar_d, category FROM vectorsvevijesti WHERE ready = %s;''',
        ('True',),
    )
    rows = cur.fetchall()
    cur.close()
    return rows
|
|
|
|
def get_source_data():
    """Fetch the rows of ``vectorsvevijesti`` whose ``ready`` flag is false.

    Returns:
        ``(title, text, link, ready)`` rows as returned by ``fetchall()``.
    """
    cur = conn.cursor()
    # The flag is passed as the string 'False' and bound as a query parameter.
    cur.execute(
        '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''',
        ('False',),
    )
    rows = cur.fetchall()
    cur.close()
    return rows
|
|
|
|
def modify_similar_data(new_value, title):
    """Set ``similar_d`` on every row with the given title and commit.

    Args:
        new_value: New value for the ``similar_d`` column.
        title: Title identifying the row(s) to update.

    Raises:
        psycopg2.Error: If the statement fails.  The transaction is rolled
            back first so the shared connection stays usable.
    """
    try:
        # The context manager closes the cursor, which was previously
        # left open.
        with conn.cursor() as cursor:
            cursor.execute(
                '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''',
                (new_value, title),
            )
        conn.commit()
    except psycopg2.Error:
        conn.rollback()
        raise
|
|
|
|
def preparing_articles(new_value, title):
    """Set the ``ready`` flag on every row with the given title and commit.

    Args:
        new_value: New value for the ``ready`` column.
        title: Title identifying the row(s) to update.

    Raises:
        psycopg2.Error: If the statement fails.  The transaction is rolled
            back first so the shared connection stays usable.
    """
    try:
        # The context manager closes the cursor, which was previously
        # left open.
        with conn.cursor() as cursor:
            cursor.execute(
                '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s ''',
                (new_value, title),
            )
        conn.commit()
    except psycopg2.Error:
        conn.rollback()
        raise
|
|
|
|
def get_specific_data(title):
    """Fetch every row of ``vectorsvevijesti`` with the given title.

    Args:
        title: Title to match exactly.

    Returns:
        ``(title, text, link, similar_d, embedding, category, ready)`` rows
        as returned by ``fetchall()``.
    """
    cur = conn.cursor()
    cur.execute(
        '''SELECT title, text, link, similar_d, embedding, category, ready FROM vectorsvevijesti WHERE title = %s''',
        (title,),
    )
    matches = cur.fetchall()
    cur.close()
    return matches
|
|
|
|
def get_all_links():
    """Collect every ``link`` stored in ``vectorsvevijesti``.

    Returns:
        A set containing the link of each row.
    """
    cur = conn.cursor()
    cur.execute('''SELECT link FROM vectorsvevijesti''')
    links = set()
    for record in cur.fetchall():
        links.add(record[0])
    cur.close()
    return links
|
|
|
|
def get_existing_titles():
    """Collect every ``title`` stored in ``articles``.

    Returns:
        A set containing the title of each row.
    """
    with conn.cursor() as cursor:
        # Only the title is used, so original_url is not fetched.
        cursor.execute('''SELECT title FROM articles''')
        return {row[0] for row in cursor.fetchall()}
|
|
|
|
def delete_specific(title):
    """Delete every row of ``vectorsvevijesti`` with the given title.

    Args:
        title: Title identifying the row(s) to delete.

    Raises:
        psycopg2.Error: If the statement fails.  The transaction is rolled
            back first so the shared connection stays usable.
    """
    try:
        with conn.cursor() as cursor:
            cursor.execute('''DELETE FROM vectorsvevijesti WHERE title = %s''', (title,))
        # Commit here like the other writers in this file; previously the
        # delete stayed pending until some other function committed.
        conn.commit()
    except psycopg2.Error:
        conn.rollback()
        raise
|
|
|
|
def cleansing(max_age=timedelta(days=1)):
    """Delete rows of ``vectorsvevijesti`` older than ``max_age`` and commit.

    Args:
        max_age: Rows whose ``time`` is earlier than
            ``datetime.now() - max_age`` are removed.  Defaults to one
            day, the window that used to be hard-coded.

    Raises:
        psycopg2.Error: If the statement fails.  The transaction is rolled
            back first so the shared connection stays usable.
    """
    cutoff = datetime.now() - max_age
    try:
        with conn.cursor() as cursor:
            cursor.execute('''DELETE FROM vectorsvevijesti WHERE time < %s''', (cutoff,))
        conn.commit()
    except psycopg2.Error:
        conn.rollback()
        raise
|
|
|
|
def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists, then commit."""
    cur = conn.cursor()
    cur.execute('''DROP TABLE IF EXISTS vectorsvevijesti;''')
    conn.commit()
    cur.close()
|
|
|
|
def create_db():
    """Ensure the pgvector extension and the ``vectorsvevijesti`` table exist.

    Also registers the pgvector adapter on the shared connection.  This
    single definition replaces two consecutive ``create_db`` definitions;
    the earlier one was shadowed and lacked the ``category`` column that
    ``insert_data`` writes.
    """
    with conn.cursor() as cursor:
        cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
        # Registered after the extension is created, as in the original order.
        register_vector(conn)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS vectorsvevijesti (
                id bigserial PRIMARY KEY,
                title VARCHAR,
                text VARCHAR,
                link VARCHAR,
                embedding vector(1536),
                similar_d VARCHAR,
                time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                ready BOOLEAN,
                category VARCHAR
            );
        ''')
        conn.commit()
|
|
|
|
def create_ar_table():
    """Create the ``articles`` table if it does not exist, then commit."""
    ddl = '''
    CREATE TABLE IF NOT EXISTS "articles" (
    "id" bigserial PRIMARY KEY,
    "title" text NOT NULL UNIQUE,
    "content" text NOT NULL,
    "slug" text NOT NULL UNIQUE,
    "created_at" timestamptz DEFAULT NOW() NOT NULL,
    "original_url" text NOT NULL UNIQUE,
    "source_id" int NOT NULL,
    "category" VARCHAR

    );
    '''
    cur = conn.cursor()
    cur.execute(ddl)
    conn.commit()
    cur.close()
|
|
|
|
import psycopg2
|
|
from psycopg2 import sql
|
|
|
|
|
|
# Executed at import time: make sure both tables exist before any helper is
# used.  create_db() also calls register_vector(conn), so importing this
# module is what enables the pgvector adapter on the shared connection.
create_db()
create_ar_table()
|