added article.py
This commit is contained in:
BIN
pyth/__pycache__/articles.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/articles.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
231
pyth/articles.py
Normal file
231
pyth/articles.py
Normal file
@@ -0,0 +1,231 @@
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from openai import OpenAI , APIError
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, get_source_data, get_ready_data
|
||||
import tiktoken
|
||||
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
|
||||
import json
|
||||
|
||||
# Load variables from a .env file (if any) before the lookups below read them.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Both clients are built with no explicit arguments.
client = OpenAI()
embeddings = OpenAIEmbeddings()

print("Checking for similar!")

# PostgreSQL connection settings; a value is None when its variable is unset.
host, port, user, password, dbname = (
    os.getenv(key)
    for key in ("DB_HOST", "DB_PORT", "DB_USER", "DB_PASSWORD", "DB_NAME")
)
|
||||
|
||||
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector; any array-like of numbers.
        v2: Second vector, same length as ``v1``.

    Returns:
        The cosine of the angle between the vectors as a float. ``0.0`` is
        returned when either vector has zero length, because the angle is
        undefined in that case.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)

    # Cosine similarity is scale-invariant, so a single division by the
    # product of the norms replaces separate normalisation passes.
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Dividing by a zero norm would produce NaN.
        return 0.0
    return float(np.dot(a, b) / denominator)
|
||||
|
||||
def parse_embedding_string(embedding_str):
    """Convert a stored embedding into a NumPy array.

    Args:
        embedding_str: Either a bracketed, comma-separated string such as
            ``"[0.1,0.2]"`` or an ``np.ndarray``.

    Returns:
        An ``np.ndarray``. A string is parsed into a new float array; an
        array argument is returned as-is (not copied).

    Raises:
        ValueError: If the argument is neither ``str`` nor ``np.ndarray``,
            or if a component of the string is not a valid float.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str
    if not isinstance(embedding_str, str):
        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")

    # Trim outer whitespace first so the slice really removes the brackets.
    inner = embedding_str.strip()[1:-1].strip()
    if not inner:
        # "[]" has no components; float("") would raise.
        return np.array([], dtype=float)
    return np.array([float(part) for part in inner.split(",")])
|
||||
|
||||
|
||||
def get_titles_links_embeddings():
    """Fetch title, link and embedding for every row with ``ready = True``.

    Returns:
        A ``(titles, links, embeddings)`` tuple of three parallel lists, one
        entry per fetched row. Each embedding has been passed through
        ``parse_embedding_string``.

    Raises:
        Whatever psycopg2 raises while connecting or querying, and
        ``ValueError`` from ``parse_embedding_string`` for an embedding it
        cannot handle.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
            rows = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    titles = [row[0] for row in rows]
    links = [row[1] for row in rows]
    # Named `vectors` so the module-level `embeddings` client is not shadowed.
    vectors = [parse_embedding_string(row[2]) for row in rows]

    return titles, links, vectors
|
||||
|
||||
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Greedily group ready articles whose embeddings are nearly identical.

    Articles are visited in the order returned by
    ``get_titles_links_embeddings``. Each article that is not yet in a group
    seeds a new one and pulls in every remaining ungrouped article whose
    cosine similarity to the seed is strictly greater than ``threshold``.
    Members are compared to the seed only, not to each other.

    Args:
        eps: Not used by this function; kept for call compatibility.
        min_samples: Not used by this function; kept for call compatibility.
        threshold: Similarity a candidate must exceed to join a group.

    Returns:
        A list of groups, each a list of ``(title, link)`` tuples with the
        seed first. An article with no match forms a one-element group.
        An empty list is returned when a ``psycopg2.Error`` occurs.
    """
    try:
        # The helper manages its own connection, so none is opened here.
        titles, links, vectors = get_titles_links_embeddings()
    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []

    keys = list(zip(titles, links))
    seen = set()
    groups = []

    for i, seed_key in enumerate(keys):
        if seed_key in seen:
            continue
        seen.add(seed_key)
        group = [seed_key]

        # Every entry before position i is already in `seen` (it was either
        # a seed or absorbed into a group), so only later entries can join.
        for j in range(i + 1, len(keys)):
            candidate = keys[j]
            if candidate in seen:
                continue
            if calculate_cosine_similarity(vectors[i], vectors[j]) > threshold:
                seen.add(candidate)
                group.append(candidate)

        groups.append(group)

    return groups
|
||||
|
||||
def _join_unique_links(links):
    """Join non-empty links with ", ", dropping repeats in first-seen order."""
    return ", ".join(dict.fromkeys(link for link in links if link))


def _build_combine_prompt(texts):
    """Build the user prompt asking the model to merge ``texts`` into one story."""
    count = len(texts)
    tokens = sum(num_tokens_from_string(text) for text in texts)
    print(tokens)

    if tokens > 2000:
        # Over the budget: concatenate and let the slicing helper trim it.
        combined_text = slice_text_at_2k_tokens(" ".join(texts))
        return rf"Here is text {combined_text}, combined from {count} sources, filter text, and make news content, return as JSON only with single 'content' field"

    # Two texts are listed side by side; longer lists end with "and <last>".
    leading = " ".join(texts[:-1])
    if count > 2:
        listed = f"{leading} and {texts[-1]}"
    else:
        listed = f"{leading} {texts[-1]}"
    return rf"Here are {count} texts {listed}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."


def processing_similar():
    """Merge each group of similar articles into one combined article.

    For every group of two or more articles returned by
    ``find_and_group_similar_articles`` this function:

    1. loads each article with ``get_specific_data``;
    2. tags every source with a shared ``"C: <titles>"`` marker and sets its
       ready flag to ``False``;
    3. asks the chat model to merge the texts into JSON with a ``content``
       field;
    4. stores the result under the first article's title via ``insert_data``.

    A failure while generating or storing one group is printed and the loop
    moves on to the next group. Prints "No similar articles found." when the
    grouping step returns nothing.
    """
    groups = find_and_group_similar_articles()
    if not groups:
        print("No similar articles found.")
        return

    for group in groups:
        titles = [entry[0] for entry in group if len(entry) >= 2]
        count = len(titles)
        if count < 2:
            # A lone article has nothing to be merged with.
            continue
        print(count)

        rows = [get_specific_data(title) for title in titles]
        if not all(rows):
            # Indexing an empty result would raise; leave this group untouched.
            print(f"Skipping group, article data not found: {titles}")
            continue

        texts = [row[0][1] for row in rows]
        link = _join_unique_links(row[0][2] for row in rows)
        similar_d = "C: " + ", ".join(titles)
        user_message = _build_combine_prompt(texts)

        # NOTE(review): sources are flagged before the insert on purpose.
        # The combined row reuses the first title, so flagging by title
        # afterwards would presumably hide the new row as well -- confirm
        # against preparing_articles.
        for title in titles:
            modify_similar_data(similar_d, title)
            preparing_articles(False, title)

        # Defined up front so the error report below can always print it.
        generated_text = None
        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message},
                ],
            )
            generated_text = completion.choices[0].message.content

            response_data = json.loads(generated_text)
            title = titles[0]
            text = response_data["content"]
            # NOTE(review): the raw model reply is embedded, not the
            # extracted content -- kept as in the original.
            vector = embeddings.embed_query(generated_text)

            insert_data(title, text, link, vector, similar_d)
            print(f"Inserting combined: {title}")
        except Exception as e:
            # Per-group boundary: report and carry on with the next group.
            print(f"Error: {e}")
            print(f"Title: {titles[0]}")
            print(f"Answer: {generated_text}")
|
||||
if __name__ == "__main__":
    # Run the merge pass, then print a summary of each ready row.
    processing_similar()

    for row in get_ready_data() or ():
        print(f"Title: {row[0]}")
        print(f"Link: {row[2]}")
        print(f"Status: {row[3]}")
|
||||
@@ -4,7 +4,7 @@ from urllib.parse import urljoin
|
||||
from openai import OpenAI , APIError
|
||||
import os
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data)
|
||||
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data)
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
import tiktoken
|
||||
@@ -48,6 +48,19 @@ def replace_with_spaces(text):
|
||||
cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text)
|
||||
return cleaned_text
|
||||
|
||||
|
||||
def fix_links(links_set):
    """Return a new set with every "www." removed from each link.

    Links that do not contain "www." are carried over unchanged. The input
    collection is not modified.
    """
    # str.replace leaves a string untouched when the substring is absent,
    # so no separate membership check is needed.
    return {link.replace("www.", "") for link in links_set}
|
||||
|
||||
total_links = set()
|
||||
collected_news = set()
|
||||
|
||||
@@ -78,13 +91,13 @@ for dlink in dlinks:
|
||||
total_links.update(temp_links)
|
||||
|
||||
final_links = {item for item in total_links if item}
|
||||
i = 0
|
||||
|
||||
db_links = set(get_all_links())
|
||||
new_links = final_links - db_links
|
||||
final_links = new_links
|
||||
final_links = set(final_links)
|
||||
|
||||
|
||||
final_links = fix_links(final_links)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -142,6 +155,7 @@ if __name__ == '__main__':
|
||||
print(f"Error in completion: {e}")
|
||||
continue
|
||||
|
||||
|
||||
def comb_similar():
|
||||
|
||||
print("Checking similar")
|
||||
@@ -185,12 +199,17 @@ def comb_similar():
|
||||
combined_text = f"{text1}{text2}{text3}"
|
||||
combined_text = slice_text_at_2k_tokens(combined_text)
|
||||
user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field"
|
||||
link = f"{link1} {link2} {link3}"
|
||||
if link1 != link2 and link1 != link3 and link2 != link3:
|
||||
link = f"{link1} {link2} {link3}"
|
||||
else:
|
||||
link = link1
|
||||
|
||||
else:
|
||||
user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
|
||||
link = f"{link1} {link2} {link3}"
|
||||
|
||||
if link1 != link2 and link1 != link3 and link2 != link3:
|
||||
link = f"{link1} {link2} {link3}"
|
||||
else:
|
||||
link = link1
|
||||
else:
|
||||
ftcheck = num_tokens_from_string(f_text)
|
||||
stcheck = num_tokens_from_string(s_text)
|
||||
@@ -198,12 +217,17 @@ def comb_similar():
|
||||
if fscomb <2000:
|
||||
combined_text = f"{f_text}{s_text}"
|
||||
user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field"
|
||||
link = f"{link_f} {link_s}"
|
||||
if link_f != link_s:
|
||||
link = f"{link_f} {link_s}"
|
||||
else:
|
||||
link = link_f
|
||||
|
||||
else:
|
||||
user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
|
||||
link = f"{link_f} {link_s}"
|
||||
|
||||
if link_f != link_s:
|
||||
link = f"{link_f} {link_s}"
|
||||
else:
|
||||
link = link_f
|
||||
try:
|
||||
completion = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
@@ -213,7 +237,6 @@ def comb_similar():
|
||||
]
|
||||
)
|
||||
generated_text = completion.choices[0].message.content
|
||||
generated_text = generated_text
|
||||
|
||||
if similar_article:
|
||||
if f_title == s_title:
|
||||
@@ -222,6 +245,7 @@ def comb_similar():
|
||||
similar_article.remove(sa)
|
||||
print("Modified")
|
||||
else:
|
||||
print(f"First: {f_title}")
|
||||
print(f"Second: {s_title}")
|
||||
modify_similar_data(first_t,"SOURCE")
|
||||
modify_similar_data(second_t,"SOURCE")
|
||||
@@ -243,5 +267,3 @@ def comb_similar():
|
||||
except Exception as e:
|
||||
print(f"Error in completion: {e}")
|
||||
continue
|
||||
|
||||
comb_similar()
|
||||
@@ -18,6 +18,5 @@
|
||||
<a href="/article/two">Second</a>
|
||||
</article>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -83,6 +83,7 @@ def get_similar():
|
||||
return similar_data
|
||||
|
||||
|
||||
|
||||
def insert_data(title, text, link, embedding, similar_d):
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
@@ -97,9 +98,9 @@ def insert_data(title, text, link, embedding, similar_d):
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('''
|
||||
INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time)
|
||||
VALUES (%s, %s, %s, %s, %s ,%s);
|
||||
''', (title, text, link, embedding , similar_d, c_time))
|
||||
INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
|
||||
VALUES (%s, %s, %s, %s, %s ,%s ,%s);
|
||||
''', (title, text, link, embedding , similar_d, c_time, True))
|
||||
|
||||
conn.commit()
|
||||
|
||||
@@ -121,6 +122,39 @@ def get_data():
|
||||
cursor.close()
|
||||
return data
|
||||
|
||||
def get_ready_data():
    """Return every row whose ``ready`` column is true.

    Returns:
        The ``fetchall()`` result: one ``(title, text, link, ready)`` row
        per matching record.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
            # Pass a real boolean rather than the string 'True'.
            cursor.execute(query, (True,))
            return cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()
|
||||
|
||||
def get_source_data():
    """Return every row whose ``ready`` column is false.

    Returns:
        The ``fetchall()`` result: one ``(title, text, link, ready)`` row
        per matching record.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        with conn.cursor() as cursor:
            query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
            # Pass a real boolean rather than the string 'False'.
            cursor.execute(query, (False,))
            return cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()
|
||||
|
||||
|
||||
def modify_similar_data(new_value ,title):
|
||||
|
||||
conn = psycopg2.connect(
|
||||
@@ -138,6 +172,24 @@ def modify_similar_data(new_value ,title):
|
||||
|
||||
conn.commit()
|
||||
|
||||
|
||||
def preparing_articles(new_value, title):
    """Set the ``ready`` column of every row with the given title.

    Args:
        new_value: Value written to ``ready``.
        title: Title to match; all rows with this exact title are updated.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname,
    )
    try:
        # `with conn` commits on success and rolls back on an exception;
        # it does not close the connection, hence the `finally`.
        with conn:
            with conn.cursor() as cursor:
                query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
                cursor.execute(query, (new_value, title))
    finally:
        conn.close()
|
||||
|
||||
def get_specific_data(title):
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
@@ -244,7 +296,9 @@ def create_db(conn):
|
||||
link VARCHAR,
|
||||
embedding vector(1536),
|
||||
similar_d VARCHAR,
|
||||
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
ready BOOLEAN
|
||||
|
||||
);
|
||||
''')
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from flask import Flask , render_template , jsonify
|
||||
from vectData import get_data
|
||||
from vectData import get_ready_data
|
||||
from flask_cors import CORS
|
||||
|
||||
|
||||
@@ -21,4 +21,9 @@ def articleone():
|
||||
def articletwo():
|
||||
return render_template("two.html")
|
||||
|
||||
@app.route('/data/get/news', methods=['GET'])
def takenews():
    """Return the rows from ``get_ready_data`` as a JSON response."""
    return jsonify(get_ready_data())
|
||||
|
||||
app.run(debug=True)
|
||||
Reference in New Issue
Block a user