organizing code
This commit is contained in:
Binary file not shown.
Binary file not shown.
173
pyth/articles.py
173
pyth/articles.py
@@ -1,12 +1,10 @@
|
|||||||
import psycopg2
|
import psycopg2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import os
|
import os
|
||||||
from openai import OpenAI , APIError
|
from openai import OpenAI
|
||||||
from langchain.embeddings import OpenAIEmbeddings
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data
|
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
|
||||||
import tiktoken
|
|
||||||
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
|
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
|
||||||
import json
|
import json
|
||||||
|
|
||||||
@@ -18,80 +16,30 @@ embeddings = OpenAIEmbeddings()
|
|||||||
|
|
||||||
print(f"Checking for similar!")
|
print(f"Checking for similar!")
|
||||||
|
|
||||||
host = os.getenv("DB_HOST")
|
|
||||||
port = os.getenv("DB_PORT")
|
|
||||||
user = os.getenv("DB_USER")
|
|
||||||
password = os.getenv("DB_PASSWORD")
|
|
||||||
dbname = os.getenv("DB_NAME")
|
|
||||||
|
|
||||||
def calculate_cosine_similarity(v1, v2):
|
|
||||||
v1_normalized = v1 / np.linalg.norm(v1)
|
|
||||||
v2_normalized = v2 / np.linalg.norm(v2)
|
|
||||||
|
|
||||||
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
|
|
||||||
return similarity
|
|
||||||
|
|
||||||
def parse_embedding_string(embedding_str):
|
|
||||||
if isinstance(embedding_str, str):
|
|
||||||
numbers = [float(num) for num in embedding_str[1:-1].split(',')]
|
|
||||||
return np.array(numbers)
|
|
||||||
elif isinstance(embedding_str, np.ndarray):
|
|
||||||
return embedding_str
|
|
||||||
else:
|
|
||||||
raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")
|
|
||||||
|
|
||||||
|
|
||||||
def get_titles_links_embeddings():
|
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
|
||||||
cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
|
|
||||||
data = cursor.fetchall()
|
|
||||||
cursor.close()
|
|
||||||
|
|
||||||
titles = [row[0] for row in data]
|
|
||||||
links = [row[1] for row in data]
|
|
||||||
embeddings = [parse_embedding_string(row[2]) for row in data]
|
|
||||||
|
|
||||||
return titles, links, embeddings
|
|
||||||
|
|
||||||
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
|
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
|
||||||
try:
|
try:
|
||||||
conn = psycopg2.connect(
|
titles, links, embeddings = get_titles_links_embeddings()
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
|
|
||||||
with conn, conn.cursor() as cursor:
|
processed_articles = set()
|
||||||
titles, links, embeddings = get_titles_links_embeddings()
|
grouped_similar_articles = []
|
||||||
|
|
||||||
processed_articles = set()
|
for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)):
|
||||||
grouped_similar_articles = []
|
if (title1, link1) not in processed_articles:
|
||||||
|
processed_articles.add((title1, link1))
|
||||||
|
group = [(title1, link1)]
|
||||||
|
|
||||||
for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)):
|
for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)):
|
||||||
if (title1, link1) not in processed_articles:
|
if i != j and (title2, link2) not in processed_articles:
|
||||||
processed_articles.add((title1, link1))
|
similarity = calculate_cosine_similarity(embedding1, embedding2)
|
||||||
group = [(title1, link1)]
|
|
||||||
|
|
||||||
for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)):
|
if similarity > threshold:
|
||||||
if i != j and (title2, link2) not in processed_articles:
|
processed_articles.add((title2, link2))
|
||||||
similarity = calculate_cosine_similarity(embedding1, embedding2)
|
group.append((title2, link2))
|
||||||
|
|
||||||
if similarity > threshold:
|
grouped_similar_articles.append(group)
|
||||||
processed_articles.add((title2, link2))
|
|
||||||
group.append((title2, link2))
|
|
||||||
|
|
||||||
grouped_similar_articles.append(group)
|
return grouped_similar_articles
|
||||||
|
|
||||||
return grouped_similar_articles
|
|
||||||
|
|
||||||
except psycopg2.Error as e:
|
except psycopg2.Error as e:
|
||||||
print(f"Error: {e}")
|
print(f"Error: {e}")
|
||||||
@@ -101,7 +49,6 @@ def processing_similar():
|
|||||||
grouped_similar_articles_result = find_and_group_similar_articles()
|
grouped_similar_articles_result = find_and_group_similar_articles()
|
||||||
|
|
||||||
if grouped_similar_articles_result:
|
if grouped_similar_articles_result:
|
||||||
|
|
||||||
for group in grouped_similar_articles_result:
|
for group in grouped_similar_articles_result:
|
||||||
articles = []
|
articles = []
|
||||||
|
|
||||||
@@ -112,8 +59,8 @@ def processing_similar():
|
|||||||
article = [title, link]
|
article = [title, link]
|
||||||
articles.append(article)
|
articles.append(article)
|
||||||
l = len(articles)
|
l = len(articles)
|
||||||
|
|
||||||
if l == 2:
|
if l == 2:
|
||||||
print("2")
|
|
||||||
a_one = articles[0][0]
|
a_one = articles[0][0]
|
||||||
a_two = articles[1][0]
|
a_two = articles[1][0]
|
||||||
|
|
||||||
@@ -141,7 +88,6 @@ def processing_similar():
|
|||||||
modify_similar_data(similar_d, a_two)
|
modify_similar_data(similar_d, a_two)
|
||||||
preparing_articles(False, a_two)
|
preparing_articles(False, a_two)
|
||||||
|
|
||||||
print(tokens)
|
|
||||||
if tokens > 2000:
|
if tokens > 2000:
|
||||||
combined_text = f"{text1} {text2}"
|
combined_text = f"{text1} {text2}"
|
||||||
combined_text = slice_text_at_2k_tokens(combined_text)
|
combined_text = slice_text_at_2k_tokens(combined_text)
|
||||||
@@ -150,7 +96,6 @@ def processing_similar():
|
|||||||
user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
|
user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
|
||||||
|
|
||||||
if l == 3:
|
if l == 3:
|
||||||
print("3")
|
|
||||||
a_one = articles[0][0]
|
a_one = articles[0][0]
|
||||||
a_two = articles[1][0]
|
a_two = articles[1][0]
|
||||||
a_three = articles[2][0]
|
a_three = articles[2][0]
|
||||||
@@ -190,13 +135,82 @@ def processing_similar():
|
|||||||
modify_similar_data(similar_d, a_three)
|
modify_similar_data(similar_d, a_three)
|
||||||
preparing_articles(False, a_three)
|
preparing_articles(False, a_three)
|
||||||
|
|
||||||
print(tokens)
|
|
||||||
if tokens > 2000:
|
if tokens > 2000:
|
||||||
combined_text = f"{text1} {text2} {text3}"
|
combined_text = f"{text1} {text2} {text3}"
|
||||||
combined_text = slice_text_at_2k_tokens(combined_text)
|
combined_text = slice_text_at_2k_tokens(combined_text)
|
||||||
user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field"
|
user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field"
|
||||||
else:
|
else:
|
||||||
user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
|
user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
|
||||||
|
if l == 4:
|
||||||
|
print("4")
|
||||||
|
a_one = articles[0][0]
|
||||||
|
a_two = articles[1][0]
|
||||||
|
a_three = articles[2][0]
|
||||||
|
a_four = articles[3][0]
|
||||||
|
|
||||||
|
get_one = get_specific_data(a_one)
|
||||||
|
get_two = get_specific_data(a_two)
|
||||||
|
get_three = get_specific_data(a_three)
|
||||||
|
get_four = get_specific_data(a_four)
|
||||||
|
|
||||||
|
text1 = get_one[0][1]
|
||||||
|
text2 = get_two[0][1]
|
||||||
|
text3 = get_three[0][1]
|
||||||
|
text4 = get_four[0][1]
|
||||||
|
link1 = get_one[0][2]
|
||||||
|
link2 = get_two[0][2]
|
||||||
|
link3 = get_three[0][2]
|
||||||
|
link4 = get_four[0][2]
|
||||||
|
|
||||||
|
if link1 != link2:
|
||||||
|
if link2 != link3:
|
||||||
|
if link3 != link4:
|
||||||
|
link = f"{link1}, {link2}, {link3}, {link4}"
|
||||||
|
else:
|
||||||
|
link = f"{link1}, {link2}, {link3}"
|
||||||
|
else:
|
||||||
|
if link3 != link4:
|
||||||
|
link = f"{link1}, {link2}, {link4}"
|
||||||
|
else:
|
||||||
|
link = f"{link1}, {link2}"
|
||||||
|
else:
|
||||||
|
if link2 != link3:
|
||||||
|
if link3 != link4:
|
||||||
|
link = f"{link1}, {link3}, {link4}"
|
||||||
|
else:
|
||||||
|
link = f"{link1}, {link3}"
|
||||||
|
else:
|
||||||
|
if link3 != link4:
|
||||||
|
link = f"{link1}, {link4}"
|
||||||
|
else:
|
||||||
|
link = link1
|
||||||
|
|
||||||
|
ftoks = num_tokens_from_string(text1)
|
||||||
|
stoks = num_tokens_from_string(text2)
|
||||||
|
ttoks = num_tokens_from_string(text3)
|
||||||
|
frtoks = num_tokens_from_string(text4)
|
||||||
|
|
||||||
|
tokens = ftoks + stoks + ttoks + frtoks
|
||||||
|
|
||||||
|
similar_d = f"C: {a_one}, {a_two}, {a_three}, {a_four}"
|
||||||
|
modify_similar_data(similar_d, a_one)
|
||||||
|
preparing_articles(False, a_one)
|
||||||
|
|
||||||
|
modify_similar_data(similar_d, a_two)
|
||||||
|
preparing_articles(False, a_two)
|
||||||
|
|
||||||
|
modify_similar_data(similar_d, a_three)
|
||||||
|
preparing_articles(False, a_three)
|
||||||
|
|
||||||
|
modify_similar_data(similar_d, a_four)
|
||||||
|
preparing_articles(False, a_four)
|
||||||
|
|
||||||
|
if tokens > 2000:
|
||||||
|
combined_text = f"{text1} {text2} {text3} {text4}"
|
||||||
|
combined_text = slice_text_at_2k_tokens(combined_text)
|
||||||
|
user_message = rf"Here is text {combined_text}, combined from 4 sources, filter text, and make news content, return as JSON only with a single 'content' field"
|
||||||
|
else:
|
||||||
|
user_message = rf"Here are 4 texts {text1} {text2} {text3} and {text4}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field."
|
||||||
try:
|
try:
|
||||||
completion = client.chat.completions.create(
|
completion = client.chat.completions.create(
|
||||||
model="gpt-3.5-turbo",
|
model="gpt-3.5-turbo",
|
||||||
@@ -216,16 +230,11 @@ def processing_similar():
|
|||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error: {e}")
|
print(f"Error: {e}")
|
||||||
print(f"Title: {a_one}")
|
print(a_one)
|
||||||
print(f"Answer: {generated_text}")
|
|
||||||
continue
|
continue
|
||||||
|
else:
|
||||||
|
print("Done!.")
|
||||||
else:
|
else:
|
||||||
print("No similar articles found.")
|
print("No similar articles found.")
|
||||||
if __name__=="__main__":
|
if __name__=="__main__":
|
||||||
processing_similar()
|
processing_similar()
|
||||||
ready = get_ready_data()
|
|
||||||
if ready:
|
|
||||||
for a in ready:
|
|
||||||
print(f"Title: {a[0]}")
|
|
||||||
print(f"Link: {a[2]}")
|
|
||||||
print(f"Status: {a[3]}")
|
|
||||||
@@ -1,10 +1,10 @@
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests
|
import requests
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from openai import OpenAI , APIError
|
from openai import OpenAI
|
||||||
import os
|
import os
|
||||||
from langchain.embeddings import OpenAIEmbeddings
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data)
|
from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
|
||||||
import json
|
import json
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import tiktoken
|
import tiktoken
|
||||||
@@ -39,7 +39,7 @@ def slice_text_at_2k_tokens(text):
|
|||||||
|
|
||||||
sliced_tokens = tokens[:max_tokens]
|
sliced_tokens = tokens[:max_tokens]
|
||||||
sliced_text = encoding.decode(sliced_tokens)
|
sliced_text = encoding.decode(sliced_tokens)
|
||||||
|
|
||||||
return sliced_text
|
return sliced_text
|
||||||
|
|
||||||
|
|
||||||
@@ -82,7 +82,6 @@ def get_article_links(url, already_checked):
|
|||||||
return link_store
|
return link_store
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
already_checked = set()
|
already_checked = set()
|
||||||
|
|
||||||
for dlink in dlinks:
|
for dlink in dlinks:
|
||||||
@@ -116,8 +115,6 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
title_text = replace_with_spaces(title_text)
|
title_text = replace_with_spaces(title_text)
|
||||||
|
|
||||||
|
|
||||||
print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")
|
|
||||||
text_text = slice_text_at_2k_tokens(text_text)
|
text_text = slice_text_at_2k_tokens(text_text)
|
||||||
text_text = replace_with_spaces(str(text_text))
|
text_text = replace_with_spaces(str(text_text))
|
||||||
|
|
||||||
@@ -138,13 +135,6 @@ if __name__ == '__main__':
|
|||||||
title = response_data["title"]
|
title = response_data["title"]
|
||||||
text = response_data["content"]
|
text = response_data["content"]
|
||||||
|
|
||||||
#print("*********************************")
|
|
||||||
#print(f"Title: {title}")
|
|
||||||
#print("---------------------------------")
|
|
||||||
#print(f"Content : {text}")
|
|
||||||
#print("*********************************")
|
|
||||||
|
|
||||||
|
|
||||||
vector = embeddings.embed_query(generated_text)
|
vector = embeddings.embed_query(generated_text)
|
||||||
|
|
||||||
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
||||||
|
|||||||
168
pyth/vectData.py
168
pyth/vectData.py
@@ -7,7 +7,6 @@ import os
|
|||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from datetime import datetime ,timedelta
|
from datetime import datetime ,timedelta
|
||||||
|
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
host = os.getenv("DB_HOST")
|
host = os.getenv("DB_HOST")
|
||||||
@@ -27,20 +26,20 @@ conn = psycopg2.connect(
|
|||||||
def calculate_cosine_similarity(v1, v2):
|
def calculate_cosine_similarity(v1, v2):
|
||||||
v1_normalized = v1 / np.linalg.norm(v1)
|
v1_normalized = v1 / np.linalg.norm(v1)
|
||||||
v2_normalized = v2 / np.linalg.norm(v2)
|
v2_normalized = v2 / np.linalg.norm(v2)
|
||||||
|
|
||||||
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
|
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
|
||||||
return similarity
|
return similarity
|
||||||
|
|
||||||
def is_similar_data(title, text, link, embedding, threshold=0.98):
|
def parse_embedding_string(embedding_str):
|
||||||
conn = psycopg2.connect(
|
if isinstance(embedding_str, str):
|
||||||
host=host,
|
numbers = [float(num) for num in embedding_str[1:-1].split(',')]
|
||||||
port=port,
|
return np.array(numbers)
|
||||||
user=user,
|
elif isinstance(embedding_str, np.ndarray):
|
||||||
password=password,
|
return embedding_str
|
||||||
dbname=dbname
|
else:
|
||||||
)
|
raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")
|
||||||
cursor = conn.cursor()
|
|
||||||
|
|
||||||
|
def is_similar_data(title, text, link, embedding, threshold=0.98):
|
||||||
|
cursor = conn.cursor()
|
||||||
cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
|
cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
|
||||||
existing_embeddings = cursor.fetchall()
|
existing_embeddings = cursor.fetchall()
|
||||||
|
|
||||||
@@ -54,12 +53,12 @@ def is_similar_data(title, text, link, embedding, threshold=0.98):
|
|||||||
similar_d = existing_title
|
similar_d = existing_title
|
||||||
insert_data(title,text,link,embedding,similar_d)
|
insert_data(title,text,link,embedding,similar_d)
|
||||||
print(f"Similar data found: \n #{title} \n #{existing_title}")
|
print(f"Similar data found: \n #{title} \n #{existing_title}")
|
||||||
print(f"Inserting: #{title} \n")
|
print(f"Inserting: #{title}")
|
||||||
similar_d = "NO"
|
similar_d = "NO"
|
||||||
cursor.close()
|
cursor.close()
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
print(f"Same source of same article!")
|
print(f"Same article of same source!")
|
||||||
cursor.close()
|
cursor.close()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -68,13 +67,6 @@ def is_similar_data(title, text, link, embedding, threshold=0.98):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def get_similar():
|
def get_similar():
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
|
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
@@ -82,73 +74,49 @@ def get_similar():
|
|||||||
cursor.close()
|
cursor.close()
|
||||||
return similar_data
|
return similar_data
|
||||||
|
|
||||||
|
def get_titles_links_embeddings():
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
|
||||||
|
data = cursor.fetchall()
|
||||||
|
cursor.close()
|
||||||
|
|
||||||
|
titles = [row[0] for row in data]
|
||||||
|
links = [row[1] for row in data]
|
||||||
|
embeddings = [parse_embedding_string(row[2]) for row in data]
|
||||||
|
|
||||||
|
return titles, links, embeddings
|
||||||
|
|
||||||
|
|
||||||
def insert_data(title, text, link, embedding, similar_d):
|
def insert_data(title, text, link, embedding, similar_d):
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
c_time = datetime.now()
|
c_time = datetime.now()
|
||||||
|
|
||||||
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
|
INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
|
||||||
VALUES (%s, %s, %s, %s, %s ,%s ,%s);
|
VALUES (%s, %s, %s, %s, %s ,%s ,%s);
|
||||||
''', (title, text, link, embedding , similar_d, c_time, True))
|
''', (title, text, link, embedding , similar_d, c_time, True))
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
def get_data():
|
def get_data():
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
query = '''SELECT title,text,link FROM vectorsvevijesti;'''
|
query = '''SELECT title,text,link FROM vectorsvevijesti;'''
|
||||||
|
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
data = cursor.fetchall()
|
data = cursor.fetchall()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def get_ready_data():
|
def get_ready_data():
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
|
query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
|
||||||
|
|
||||||
cursor.execute(query, ('True',))
|
cursor.execute(query, ('True',))
|
||||||
data = cursor.fetchall()
|
data = cursor.fetchall()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def get_source_data():
|
def get_source_data():
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
|
query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
|
||||||
|
|
||||||
cursor.execute(query, ('False',))
|
cursor.execute(query, ('False',))
|
||||||
data = cursor.fetchall()
|
data = cursor.fetchall()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
@@ -156,138 +124,60 @@ def get_source_data():
|
|||||||
|
|
||||||
|
|
||||||
def modify_similar_data(new_value ,title):
|
def modify_similar_data(new_value ,title):
|
||||||
|
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
|
query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
|
||||||
|
|
||||||
cursor.execute(query, (new_value, title))
|
cursor.execute(query, (new_value, title))
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
def preparing_articles(new_value ,title):
|
def preparing_articles(new_value ,title):
|
||||||
|
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
|
query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
|
||||||
|
|
||||||
cursor.execute(query, (new_value, title))
|
cursor.execute(query, (new_value, title))
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
def get_specific_data(title):
|
def get_specific_data(title):
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s'''
|
query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s'''
|
||||||
cursor.execute(query, (title,))
|
cursor.execute(query, (title,))
|
||||||
|
|
||||||
specific_post = cursor.fetchall()
|
specific_post = cursor.fetchall()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
return specific_post
|
return specific_post
|
||||||
|
|
||||||
|
|
||||||
def get_all_links():
|
def get_all_links():
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
query = '''SELECT link FROM vectorsvevijesti'''
|
query = '''SELECT link FROM vectorsvevijesti'''
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
|
|
||||||
db_links = {link[0] for link in cursor.fetchall()}
|
db_links = {link[0] for link in cursor.fetchall()}
|
||||||
cursor.close()
|
cursor.close()
|
||||||
return db_links
|
return db_links
|
||||||
|
|
||||||
def delete_specific(title):
|
def delete_specific(title):
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
|
query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
|
||||||
|
|
||||||
cursor.execute(query,(title,))
|
cursor.execute(query,(title,))
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
def cleansing():
|
def cleansing():
|
||||||
|
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
|
|
||||||
day_long = datetime.now() - timedelta(days=1)
|
day_long = datetime.now() - timedelta(days=1)
|
||||||
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''
|
query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''
|
||||||
cursor.execute(query,(day_long,))
|
cursor.execute(query,(day_long,))
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
def drop_table():
|
def drop_table():
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
query = '''DROP TABLE IF EXISTS vectorsvevijesti;'''
|
query = '''DROP TABLE IF EXISTS vectorsvevijesti;'''
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
def create_db(conn):
|
def create_db():
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host,
|
|
||||||
port=port,
|
|
||||||
user=user,
|
|
||||||
password=password,
|
|
||||||
dbname=dbname
|
|
||||||
)
|
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
|
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
|
||||||
|
|
||||||
register_vector(conn)
|
register_vector(conn)
|
||||||
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
CREATE TABLE IF NOT EXISTS vectorsvevijesti (
|
CREATE TABLE IF NOT EXISTS vectorsvevijesti (
|
||||||
id bigserial PRIMARY KEY,
|
id bigserial PRIMARY KEY,
|
||||||
@@ -298,10 +188,8 @@ def create_db(conn):
|
|||||||
similar_d VARCHAR,
|
similar_d VARCHAR,
|
||||||
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
ready BOOLEAN
|
ready BOOLEAN
|
||||||
|
|
||||||
);
|
);
|
||||||
''')
|
''')
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
create_db(conn)
|
create_db()
|
||||||
|
|||||||
Reference in New Issue
Block a user