organizing code
This commit is contained in:
168
pyth/vectData.py
168
pyth/vectData.py
@@ -7,7 +7,6 @@ import os
|
||||
from dotenv import load_dotenv
|
||||
from datetime import datetime ,timedelta
|
||||
|
||||
|
||||
load_dotenv()
|
||||
|
||||
host = os.getenv("DB_HOST")
|
||||
@@ -27,20 +26,20 @@ conn = psycopg2.connect(
|
||||
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        v1: First vector (NumPy array or any sequence of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        The cosine of the angle between the two vectors as a float, or
        ``0.0`` when either vector has zero length.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)

    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero vector has no direction; dividing by its norm would produce
    # NaN, so report "no similarity" instead.
    if norm_product == 0:
        return 0.0

    return float(np.dot(a, b) / norm_product)
|
||||
|
||||
def is_similar_data(title, text, link, embedding, threshold=0.98):
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
password=password,
|
||||
dbname=dbname
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
def parse_embedding_string(embedding_str):
    """Convert an embedding into a NumPy array.

    Args:
        embedding_str: Either a NumPy array, which is returned unchanged,
            or a string of comma-separated numbers wrapped in one opening
            and one closing character, e.g. ``"[0.1,0.2,0.3]"``.

    Returns:
        A NumPy array holding the embedding values. An empty vector string
        such as ``"[]"`` yields an empty float array.

    Raises:
        ValueError: If ``embedding_str`` is neither ``str`` nor
            ``np.ndarray``, or if an element cannot be parsed as a float.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str

    if not isinstance(embedding_str, str):
        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")

    # Strip outer whitespace first so the [1:-1] slice removes the brackets
    # and not a stray space or newline.
    inner = embedding_str.strip()[1:-1].strip()
    if not inner:
        # "[]" has no elements; float('') would raise otherwise.
        return np.array([], dtype=float)

    return np.array([float(num) for num in inner.split(',')], dtype=float)
|
||||
|
||||
def is_similar_data(title, text, link, embedding, threshold=0.98):
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
|
||||
existing_embeddings = cursor.fetchall()
|
||||
|
||||
@@ -54,12 +53,12 @@ def is_similar_data(title, text, link, embedding, threshold=0.98):
|
||||
similar_d = existing_title
|
||||
insert_data(title,text,link,embedding,similar_d)
|
||||
print(f"Similar data found: \n #{title} \n #{existing_title}")
|
||||
print(f"Inserting: #{title} \n")
|
||||
print(f"Inserting: #{title}")
|
||||
similar_d = "NO"
|
||||
cursor.close()
|
||||
return True
|
||||
else:
|
||||
print(f"Same source of same article!")
|
||||
print(f"Same article of same source!")
|
||||
cursor.close()
|
||||
return True
|
||||
|
||||
@@ -68,13 +67,6 @@ def is_similar_data(title, text, link, embedding, threshold=0.98):
|
||||
return False
|
||||
|
||||
def get_similar():
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
password=password,
|
||||
dbname=dbname
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
|
||||
cursor.execute(query)
|
||||
@@ -82,73 +74,49 @@ def get_similar():
|
||||
cursor.close()
|
||||
return similar_data
|
||||
|
||||
def get_titles_links_embeddings():
    """Fetch title, link and embedding for every row where ``ready`` is true.

    Runs the query on the module-level ``conn`` connection.

    Returns:
        A tuple ``(titles, links, embeddings)`` of three parallel lists.
        Each embedding is passed through ``parse_embedding_string``.
    """
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
        rows = cursor.fetchall()

    titles = [row[0] for row in rows]
    links = [row[1] for row in rows]
    embeddings = [parse_embedding_string(row[2]) for row in rows]

    return titles, links, embeddings
|
||||
|
||||
|
||||
def insert_data(title, text, link, embedding, similar_d):
    """Insert one article row into ``vectorsvevijesti`` and commit it.

    The row is stamped with the current local time and stored with
    ``ready`` set to ``True``. A dedicated connection is opened from the
    module-level settings and closed before returning.

    Args:
        title: Value for the ``title`` column.
        text: Value for the ``text`` column.
        link: Value for the ``link`` column.
        embedding: Value for the ``embedding`` column.
        similar_d: Value for the ``similar_d`` column.
    """
    c_time = datetime.now()

    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''
                INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
                VALUES (%s, %s, %s, %s, %s ,%s ,%s);
            ''', (title, text, link, embedding, similar_d, c_time, True))
        conn.commit()
    finally:
        # Always release the per-call connection, even if the insert fails.
        conn.close()
|
||||
|
||||
def get_data():
    """Return ``(title, text, link)`` for every row of ``vectorsvevijesti``.

    A dedicated connection is opened from the module-level settings and
    closed before returning.

    Returns:
        The list of rows produced by ``cursor.fetchall()``.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
            return cursor.fetchall()
    finally:
        # Always release the per-call connection, even if the query fails.
        conn.close()
|
||||
|
||||
def get_ready_data():
    """Return ``(title, text, link, ready)`` for rows whose ``ready`` is true.

    A dedicated connection is opened from the module-level settings and
    closed before returning.

    Returns:
        The list of rows produced by ``cursor.fetchall()``.
    """
    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''

    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            # Pass a real boolean rather than the string 'True'.
            cursor.execute(query, (True,))
            return cursor.fetchall()
    finally:
        # Always release the per-call connection, even if the query fails.
        conn.close()
|
||||
|
||||
def get_source_data():
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
password=password,
|
||||
dbname=dbname
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
|
||||
|
||||
cursor.execute(query, ('False',))
|
||||
data = cursor.fetchall()
|
||||
cursor.close()
|
||||
@@ -156,138 +124,60 @@ def get_source_data():
|
||||
|
||||
|
||||
def modify_similar_data(new_value ,title):
    """Set ``similar_d`` to ``new_value`` on every row with the given title.

    The update is committed. A dedicated connection is opened from the
    module-level settings and closed before returning.

    Args:
        new_value: New value for the ``similar_d`` column.
        title: Title identifying the row(s) to update.
    """
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''

    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (new_value, title))
        conn.commit()
    finally:
        # Always release the per-call connection, even if the update fails.
        conn.close()
|
||||
|
||||
|
||||
def preparing_articles(new_value ,title):
    """Set ``ready`` to ``new_value`` on every row with the given title.

    The update is committed. A dedicated connection is opened from the
    module-level settings and closed before returning.

    Args:
        new_value: New value for the ``ready`` column.
        title: Title identifying the row(s) to update.
    """
    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''

    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (new_value, title))
        conn.commit()
    finally:
        # Always release the per-call connection, even if the update fails.
        conn.close()
|
||||
|
||||
def get_specific_data(title):
    """Return all rows of ``vectorsvevijesti`` whose title matches.

    A dedicated connection is opened from the module-level settings and
    closed before returning.

    Args:
        title: Exact title to look up.

    Returns:
        The list of rows produced by ``cursor.fetchall()``; each row is
        ``(title, text, link, similar_d, embedding, ready)``.
    """
    query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s'''

    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (title,))
            return cursor.fetchall()
    finally:
        # Always release the per-call connection, even if the query fails.
        conn.close()
|
||||
|
||||
|
||||
def get_all_links():
    """Return the set of all ``link`` values stored in ``vectorsvevijesti``.

    A dedicated connection is opened from the module-level settings and
    closed before returning.

    Returns:
        A set containing the first column of every fetched row.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''SELECT link FROM vectorsvevijesti''')
            return {row[0] for row in cursor.fetchall()}
    finally:
        # Always release the per-call connection, even if the query fails.
        conn.close()
|
||||
|
||||
def delete_specific(title):
    """Delete every row of ``vectorsvevijesti`` with the given title.

    A dedicated connection is opened from the module-level settings and
    closed before returning.

    Args:
        title: Exact title of the row(s) to delete.
    """
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''

    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (title,))
        # psycopg2 connections are not autocommit by default; without this
        # commit the DELETE is discarded when the connection goes away.
        conn.commit()
    finally:
        # Always release the per-call connection, even if the delete fails.
        conn.close()
|
||||
|
||||
def cleansing():
    """Delete rows of ``vectorsvevijesti`` whose ``time`` is over a day old.

    The cutoff is the current local time minus one day. The deletion is
    committed. A dedicated connection is opened from the module-level
    settings and closed before returning.
    """
    day_long = datetime.now() - timedelta(days=1)
    query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''

    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (day_long,))
        conn.commit()
    finally:
        # Always release the per-call connection, even if the delete fails.
        conn.close()
|
||||
|
||||
def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists and commit.

    A dedicated connection is opened from the module-level settings and
    closed before returning.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''DROP TABLE IF EXISTS vectorsvevijesti;''')
        conn.commit()
    finally:
        # Always release the per-call connection, even if the drop fails.
        conn.close()
|
||||
|
||||
def create_db(conn):
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
password=password,
|
||||
dbname=dbname
|
||||
)
|
||||
def create_db():
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
|
||||
|
||||
register_vector(conn)
|
||||
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS vectorsvevijesti (
|
||||
id bigserial PRIMARY KEY,
|
||||
@@ -298,10 +188,8 @@ def create_db(conn):
|
||||
similar_d VARCHAR,
|
||||
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
ready BOOLEAN
|
||||
|
||||
);
|
||||
''')
|
||||
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
create_db(conn)
|
||||
create_db()
|
||||
|
||||
Reference in New Issue
Block a user