Changing from js to golang
This commit is contained in:
@@ -2,6 +2,6 @@ OPENAI_API_KEY = "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"
|
||||
|
||||
DB_HOST =localhost
|
||||
DB_PORT =5432
|
||||
DB_USER =postgres
|
||||
DB_USER =svevijesti
|
||||
DB_PASSWORD =salmonela pljusti 221 hamo
|
||||
DB_NAME =svevijestiweb
|
||||
BIN
pyth/__pycache__/db_management.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/db_management.cpython-310.pyc
Normal file
Binary file not shown.
BIN
pyth/__pycache__/get_articles.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/get_articles.cpython-310.pyc
Normal file
Binary file not shown.
BIN
pyth/__pycache__/publishing_finals.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/publishing_finals.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
pyth/__pycache__/tttt.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/tttt.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
241
pyth/articles.py
241
pyth/articles.py
@@ -1,241 +0,0 @@
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from openai import OpenAI
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
|
||||
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
|
||||
import json
|
||||
from json_repair import repair_json
|
||||
|
||||
load_dotenv()
|
||||
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
||||
client = OpenAI()
|
||||
embeddings = OpenAIEmbeddings()
|
||||
|
||||
print(f"Checking for similar!")
|
||||
|
||||
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Greedily cluster stored articles whose embeddings are nearly identical.

    Every article that is not yet part of a cluster becomes the seed of a new
    one; all other unclaimed articles whose cosine similarity to the seed is
    strictly greater than ``threshold`` join that cluster.

    Args:
        eps: Accepted but not used by this implementation.
        min_samples: Accepted but not used by this implementation.
        threshold: Similarity a candidate must exceed to join a seed's cluster.

    Returns:
        A list of clusters, each a list of ``(title, link)`` tuples whose first
        entry is the seed. Articles without a match form one-element clusters.
        An empty list is returned when a ``psycopg2.Error`` is raised.
    """
    try:
        titles, links, vectors = get_titles_links_embeddings()
        records = list(zip(titles, links, vectors))

        claimed = set()
        clusters = []

        for seed_idx, (seed_title, seed_link, seed_vec) in enumerate(records):
            seed_key = (seed_title, seed_link)
            if seed_key in claimed:
                continue

            claimed.add(seed_key)
            cluster = [seed_key]

            for other_idx, (other_title, other_link, other_vec) in enumerate(records):
                other_key = (other_title, other_link)
                if other_idx == seed_idx or other_key in claimed:
                    continue

                if calculate_cosine_similarity(seed_vec, other_vec) > threshold:
                    claimed.add(other_key)
                    cluster.append(other_key)

            clusters.append(cluster)

        return clusters

    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []
|
||||
|
||||
def processing_similar():
    """Merge every group of similar articles into one combined article.

    For each group returned by ``find_and_group_similar_articles`` that holds
    more than one article, this function:

    1. loads each member's stored row (index 1 is the text, index 2 the link),
    2. tags every member with a shared ``"C: <titles>"`` marker and clears its
       ``ready`` flag,
    3. asks the chat model to merge the texts into a JSON object with a single
       ``content`` field, and
    4. stores the result under the first member's title.

    Groups of any size are handled by the same code path. The prompt always
    reports the real number of sources, and the stored link string lists each
    distinct link once, in first-seen order.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    groups = find_and_group_similar_articles()
    if not groups:
        print("No similar articles found.")
        return

    for group in groups:
        if len(group) <= 1:
            # A group holding only its seed article has nothing to merge with.
            print("Done!.")
            continue

        # Only the title is taken from the group; text and link come from the DB.
        titles = [entry[0] for entry in group if len(entry) >= 2]
        if len(titles) < 2:
            continue

        try:
            rows = [get_specific_data(title)[0] for title in titles]
        except IndexError:
            # A title with no stored row cannot be merged; skip the whole group.
            print(f"Error: no stored row for one of {titles}")
            continue

        texts = [row[1] for row in rows]
        # dict.fromkeys removes repeated links while keeping first-seen order.
        link = ", ".join(str(url) for url in dict.fromkeys(row[2] for row in rows))
        tokens = sum(num_tokens_from_string(text) for text in texts)

        similar_d = f"C: {', '.join(titles)}"
        for title in titles:
            modify_similar_data(similar_d, title)
            preparing_articles(False, title)

        source_count = len(texts)
        joined_text = " ".join(texts)
        if tokens > 2000:
            # Too long for one request: truncate the concatenation first.
            combined_text = slice_text_at_2k_tokens(joined_text)
            user_message = rf"Here is text {combined_text}, combined from {source_count} sources, filter text, and make news content, return as JSON only with a single 'content' field"
        else:
            user_message = rf"Here are {source_count} texts {joined_text}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field."

        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message}
                ])
            generated_text = completion.choices[0].message.content

            # The model output is not guaranteed to be valid JSON; repair it first.
            generated_text = repair_json(generated_text)

            response_data = json.loads(generated_text)
            title = titles[0]
            text = response_data["content"]
            vector = embeddings.embed_query(generated_text)

            insert_data(title, text, link, vector, similar_d)
            print(f"Inserting combined: {title}")

        except Exception as e:
            # Best effort: report the failing group and move on to the next one.
            print(f"Error: {e}")
            print(titles[0])
            continue
|
||||
if __name__=="__main__":
|
||||
processing_similar()
|
||||
122
pyth/checking_similar.py
Normal file
122
pyth/checking_similar.py
Normal file
@@ -0,0 +1,122 @@
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from openai import OpenAI
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from db_management import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity, get_titles_links_embeddings
|
||||
from get_articles import slice_text_at_2k_tokens
|
||||
import json
|
||||
from json_repair import repair_json
|
||||
from publishing_finals import publish_articles
|
||||
|
||||
load_dotenv()
|
||||
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
||||
client = OpenAI()
|
||||
embeddings = OpenAIEmbeddings()
|
||||
|
||||
print("Checking for similar!")
|
||||
|
||||
|
||||
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Cluster stored articles that are near-duplicates published under different links.

    Each article not yet assigned to a cluster seeds a new one. Another
    unassigned article joins it when its cosine similarity to the seed is
    strictly greater than ``threshold`` and its link differs from the seed's.

    Args:
        eps: Accepted but not used by this implementation.
        min_samples: Accepted but not used by this implementation.
        threshold: Similarity a candidate must exceed to join a cluster.

    Returns:
        A list of clusters. The first entry of a cluster is the seed as a
        ``(title, link)`` tuple; every later entry is a
        ``(title, link, embedding)`` tuple. An empty list is returned when a
        ``psycopg2.Error`` is raised.
    """
    try:
        titles, links, vectors = get_titles_links_embeddings()
        records = list(zip(titles, links, vectors))

        assigned = set()
        clusters = []

        for seed_pos, (seed_title, seed_link, seed_vec) in enumerate(records):
            if (seed_title, seed_link) in assigned:
                continue

            assigned.add((seed_title, seed_link))
            members = [(seed_title, seed_link)]

            for cand_pos, (cand_title, cand_link, cand_vec) in enumerate(records):
                if cand_pos == seed_pos or (cand_title, cand_link) in assigned:
                    continue

                score = calculate_cosine_similarity(seed_vec, cand_vec)
                if score > threshold and seed_link != cand_link:
                    assigned.add((cand_title, cand_link))
                    members.append((cand_title, cand_link, cand_vec))

            clusters.append(members)

        return clusters

    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []
|
||||
|
||||
|
||||
def processing_articles(articles):
    """Mark a group of similar articles as merged and build their combined input.

    Each article's stored text is read once. Every member is tagged with the
    shared ``"C: <titles>"`` marker and has its ``ready`` flag cleared. The
    texts are then concatenated and truncated with ``slice_text_at_2k_tokens``.

    Args:
        articles: Sequence of tuples whose first two items are the article
            title and link; any further items are ignored.

    Returns:
        A ``(combined_text, link)`` pair. ``link`` is the single distinct link
        of the group, or all distinct links joined with ``", "`` in first-seen
        order so the stored value is the same on every run.
    """
    # The marker is identical for every member, so build it once.
    similar_label = f"C: {', '.join(art[0] for art in articles)}"

    texts = []
    ordered_links = []
    seen_links = set()

    for article in articles:
        a_title, a_link = article[:2]
        # Index 1 of the stored row is the article text.
        texts.append(get_specific_data(a_title)[0][1])

        modify_similar_data(similar_label, a_title)
        preparing_articles(False, a_title)

        if a_link not in seen_links:
            seen_links.add(a_link)
            ordered_links.append(a_link)

    combined_text = slice_text_at_2k_tokens(' '.join(texts))

    if len(ordered_links) == 1:
        link = ordered_links[0]
    else:
        link = ', '.join(ordered_links)
    return combined_text, link
|
||||
|
||||
|
||||
def processing_similar():
    """Combine each group of similar articles into one generated article.

    Groups with more than one member are passed to ``processing_articles``;
    the resulting text is sent to the chat model, whose JSON answer (a single
    ``content`` field) is stored under the first member's title together with
    that member's category. Failures for one group are printed and do not stop
    the remaining groups.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    groups = find_and_group_similar_articles()

    if not groups:
        print("No similar articles found.")
        return

    for articles in groups:
        if len(articles) <= 1:
            # Single-member group: nothing to combine.
            print("Done!.")
            continue

        combined_text, link = processing_articles(articles)
        user_message = (
            rf"Here are {len(articles)} texts {combined_text}, combine the following texts into a cohesive news, "
            rf"remove any non-news related to all texts, and provide the cleaned data on Bosnian languageas and return as JSON only with a single 'content' field."
        )
        chat_messages = [
            {"role": "system", "content": "Data analytic, Journalist and News reporter"},
            {"role": "user", "content": user_message}
        ]

        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=chat_messages)
            raw_answer = completion.choices[0].message.content
            # Model output may be malformed JSON; repair it before parsing.
            generated_text = repair_json(raw_answer)

            response_data = json.loads(generated_text)
            title = articles[0][0]
            text = response_data["content"]
            vector = embeddings.embed_query(generated_text)

            # Index 5 of the stored row is the category of the first member.
            category = get_specific_data(title)[0][5]

            merged_label = f"C: {', '.join(art[0] for art in articles)}"
            insert_data(title, text, link, vector, merged_label, category)
            print(f"Inserting combined: {title} and Category: {category}")

        except Exception as e:
            print(f"Error: {e}")
            print(articles[0][0])
            continue
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
processing_similar()
|
||||
publish_articles()
|
||||
@@ -68,7 +68,7 @@ def is_similar_data(title, text, link, embedding, threshold=0.98):
|
||||
|
||||
def get_similar():
|
||||
cursor = conn.cursor()
|
||||
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
|
||||
query = '''SELECT title, link, similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
|
||||
cursor.execute(query)
|
||||
similar_data = cursor.fetchall()
|
||||
cursor.close()
|
||||
@@ -87,18 +87,23 @@ def get_titles_links_embeddings():
|
||||
return titles, links, embeddings
|
||||
|
||||
|
||||
def insert_data(title, text, link, embedding, similar_d):
|
||||
def insert_data(title, text, link, embedding, similar_d,category):
|
||||
c_time = datetime.now()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
|
||||
VALUES (%s, %s, %s, %s, %s ,%s ,%s);
|
||||
''', (title, text, link, embedding , similar_d, c_time, True))
|
||||
INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready, category)
|
||||
VALUES (%s, %s, %s, %s, %s ,%s ,%s ,%s);
|
||||
''', (title, text, link, embedding , similar_d, c_time, True , category))
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
|
||||
def get_data():
|
||||
def insert_final(title,text,slug,link,source_id, category):
    """Insert one publishable article into the ``articles`` table.

    A row whose ``original_url`` already exists is silently skipped by the
    ``ON CONFLICT (original_url) DO NOTHING`` clause. The transaction is
    committed after the statement runs.
    """
    statement = '''INSERT INTO articles (title, content, slug, original_url, source_id, category)
                    VALUES (%s, %s, %s, %s, %s, %s)ON CONFLICT (original_url) DO NOTHING;'''
    values = (title , text, slug, link, source_id, category)
    with conn.cursor() as cursor:
        cursor.execute(statement, values)
        conn.commit()
|
||||
|
||||
def get_data():
|
||||
cursor = conn.cursor()
|
||||
query = '''SELECT title,text,link FROM vectorsvevijesti;'''
|
||||
cursor.execute(query)
|
||||
@@ -108,7 +113,7 @@ def get_data():
|
||||
|
||||
def get_ready_data():
|
||||
cursor = conn.cursor()
|
||||
query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
|
||||
query = '''SELECT title, text, link, time, similar_d, category FROM vectorsvevijesti WHERE ready = %s;'''
|
||||
cursor.execute(query, ('True',))
|
||||
data = cursor.fetchall()
|
||||
cursor.close()
|
||||
@@ -122,14 +127,12 @@ def get_source_data():
|
||||
cursor.close()
|
||||
return data
|
||||
|
||||
|
||||
def modify_similar_data(new_value, title):
    """Set the ``similar_d`` marker of every stored row with the given title.

    Args:
        new_value: Value written to the ``similar_d`` column.
        title: Title identifying the row(s) in ``vectorsvevijesti`` to update.

    The cursor is opened in a ``with`` block so it is closed even when the
    statement raises; the change is committed afterwards.
    """
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
    with conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
    conn.commit()
|
||||
|
||||
|
||||
def preparing_articles(new_value ,title):
|
||||
cursor = conn.cursor()
|
||||
query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
|
||||
@@ -138,13 +141,12 @@ def preparing_articles(new_value ,title):
|
||||
|
||||
def get_specific_data(title):
|
||||
cursor = conn.cursor()
|
||||
query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s'''
|
||||
query = '''SELECT title, text, link, similar_d, embedding, category, ready FROM vectorsvevijesti WHERE title = %s'''
|
||||
cursor.execute(query, (title,))
|
||||
specific_post = cursor.fetchall()
|
||||
cursor.close()
|
||||
return specific_post
|
||||
|
||||
|
||||
def get_all_links():
|
||||
cursor = conn.cursor()
|
||||
query = '''SELECT link FROM vectorsvevijesti'''
|
||||
@@ -153,6 +155,14 @@ def get_all_links():
|
||||
cursor.close()
|
||||
return db_links
|
||||
|
||||
def get_existing_titles():
    """Return the set of titles currently stored in the ``articles`` table.

    The query also selects ``original_url``, but only the first column
    (the title) of each row is kept in the result.
    """
    query = '''SELECT title, original_url FROM articles'''
    cursor = conn.cursor()
    cursor.execute(query)
    titles = set()
    for row in cursor.fetchall():
        titles.add(row[0])
    cursor.close()
    return titles
|
||||
|
||||
def delete_specific(title):
|
||||
cursor = conn.cursor()
|
||||
query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
|
||||
@@ -192,4 +202,48 @@ def create_db():
|
||||
''')
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
|
||||
def create_db():
    """Create the pgvector extension and the ``vectorsvevijesti`` table if missing.

    ``register_vector`` is called on the connection after the extension
    exists and before the table is created. The cursor is managed by a
    ``with`` block so it is closed even when a statement raises.
    """
    with conn.cursor() as cursor:
        cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
        register_vector(conn)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS vectorsvevijesti (
                id bigserial PRIMARY KEY,
                title VARCHAR,
                text VARCHAR,
                link VARCHAR,
                embedding vector(1536),
                similar_d VARCHAR,
                time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                ready BOOLEAN,
                category VARCHAR
            );
        ''')
        conn.commit()
|
||||
|
||||
def create_ar_table():
    """Create the ``articles`` table of published articles if it is missing.

    ``title``, ``slug`` and ``original_url`` are each declared UNIQUE. The
    cursor is managed by a ``with`` block so it is closed even when the
    statement raises.
    """
    with conn.cursor() as cursor:
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS "articles" (
                "id" bigserial PRIMARY KEY,
                "title" text NOT NULL UNIQUE,
                "content" text NOT NULL,
                "slug" text NOT NULL UNIQUE,
                "created_at" timestamptz DEFAULT NOW() NOT NULL,
                "original_url" text NOT NULL UNIQUE,
                "source_id" int NOT NULL,
                "category" VARCHAR

            );
        ''')
        conn.commit()
|
||||
|
||||
import psycopg2
|
||||
from psycopg2 import sql
|
||||
|
||||
|
||||
create_db()
|
||||
create_ar_table()
|
||||
2
pyth/delete_db.py
Normal file
2
pyth/delete_db.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from db_management import delete_tables
|
||||
delete_tables()
|
||||
@@ -3,8 +3,8 @@ import requests
|
||||
from urllib.parse import urljoin
|
||||
from openai import OpenAI
|
||||
import os
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from db_management import (insert_data ,is_similar_data ,get_all_links,cleansing )
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
import tiktoken
|
||||
@@ -18,7 +18,7 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
||||
client = OpenAI()
|
||||
embeddings = OpenAIEmbeddings()
|
||||
|
||||
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
|
||||
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info','https://www.index.hr', 'https://avaz.ba', 'https://www.telegraf.rs', 'https://www.blic.rs', 'https://www.vijesti.me','https://dnevnik.hr','https://24sata.hr']
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
|
||||
|
||||
def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
|
||||
@@ -97,50 +97,65 @@ final_links = fix_links(final_links)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
for link in final_links:
|
||||
response = requests.get(link,headers)
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
for link in final_links:
|
||||
if link not in db_links:
|
||||
print(f"Processing link: {link}")
|
||||
db_links.add(link)
|
||||
|
||||
titles = soup.find_all(['h2', 'h1','h3'])
|
||||
title_text = ' '.join([title.get_text(strip=True) for title in titles])
|
||||
response = requests.get(link,headers)
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
texts = soup.find_all(['p'])
|
||||
text_text = ' '.join([text.get_text(strip=True) for text in texts])
|
||||
titles = soup.find_all(['h2', 'h1','h3'])
|
||||
title_text = ' '.join([title.get_text(strip=True) for title in titles])
|
||||
|
||||
text_text = text_text
|
||||
title_text = title_text
|
||||
texts = soup.find_all(['p'])
|
||||
text_text = ' '.join([text.get_text(strip=True) for text in texts])
|
||||
|
||||
text_text = text_text
|
||||
title_text = title_text
|
||||
|
||||
title_text = replace_with_spaces(title_text)
|
||||
title_text = replace_with_spaces(title_text)
|
||||
|
||||
text_text = slice_text_at_2k_tokens(text_text)
|
||||
text_text = replace_with_spaces(str(text_text))
|
||||
text_text = slice_text_at_2k_tokens(text_text)
|
||||
text_text = replace_with_spaces(str(text_text))
|
||||
|
||||
ttk = num_tokens_from_string(text_text)
|
||||
ttk = num_tokens_from_string(text_text)
|
||||
|
||||
if ttk > 1900:
|
||||
title_text = slice_title_if_needed(title_text)
|
||||
try:
|
||||
completion = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "system", "content": "Data analytic, Journalist and News reporter"},
|
||||
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."}
|
||||
]
|
||||
)
|
||||
generated_text = completion.choices[0].message.content
|
||||
category_options = ['politics','business','sport','magazine','scitech']
|
||||
|
||||
generated_text = repair_json(generated_text)
|
||||
if ttk > 1900:
|
||||
title_text = slice_title_if_needed(title_text)
|
||||
try:
|
||||
completion = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "system", "content": "Data analytic, Journalist and News reporter"},
|
||||
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title and remove 'FOTO' and 'VIDEO' from title and text, from {category_options} select category in wich that news belong, and provide the cleaned data make sure that its on Bosnian language and valid JSON object with 'title' field, 'category' and 'content' field."}
|
||||
])
|
||||
generated_text = completion.choices[0].message.content
|
||||
|
||||
response_data = json.loads(generated_text)
|
||||
title = response_data["title"]
|
||||
text = response_data["content"]
|
||||
vector = embeddings.embed_query(generated_text)
|
||||
generated_text = repair_json(generated_text)
|
||||
|
||||
response_data = json.loads(generated_text)
|
||||
title = response_data["title"]
|
||||
predicted_category = response_data["category"]
|
||||
text = response_data["content"]
|
||||
|
||||
if predicted_category.lower() in category_options:
|
||||
category = predicted_category.lower()
|
||||
else:
|
||||
category = 'other'
|
||||
|
||||
vector = embeddings.embed_query(generated_text)
|
||||
|
||||
print(f"Title: {title}")
|
||||
print(f"Category: {category}")
|
||||
|
||||
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
||||
similar_d = "NO"
|
||||
insert_data(title, text, link, vector,similar_d)
|
||||
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
||||
similar_d = "NO"
|
||||
insert_data(title, text, link, vector,similar_d,category)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in completion: {e}")
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Error in completion: {e}")
|
||||
continue
|
||||
|
||||
69
pyth/publishing_finals.py
Normal file
69
pyth/publishing_finals.py
Normal file
@@ -0,0 +1,69 @@
|
||||
from slugify import slugify
|
||||
import random
|
||||
from db_management import get_ready_data,insert_final,get_existing_titles
|
||||
|
||||
def create_slug(title):
    """Build a URL slug from a random number in 1..1000 followed by the title.

    The number and title are joined with a space and passed to ``slugify``.
    """
    prefix = random.randint(1, 1000)
    return slugify(f"{prefix} {title}")
|
||||
|
||||
def get_source_id(link,similar):
    """Map an article to its numeric source id.

    Args:
        link: URL of the article; only inspected when ``similar`` is ``"NO"``.
        similar: The article's ``similar_d`` marker.

    Returns:
        5 when ``similar`` is anything other than ``"NO"``. Otherwise the id
        of the first marker below that occurs as a substring of ``link``, or
        0 when none matches.
    """
    if similar != "NO":
        return 5

    # Order matters: the first marker contained in the link wins.
    markers = (
        ("srpskainfo", 1),
        ("klix", 2),
        ("bljesak", 3),
        ("blic", 4),
        ("index.hr", 6),
        ("avaz", 7),
        ("telegraf", 8),
        ("vijesti.me", 9),
        ("dnevnik.hr", 10),
        ("24sata.hr", 11),
    )
    for marker, source_id in markers:
        if marker in link:
            return source_id
    return 0
|
||||
|
||||
data = get_ready_data()
|
||||
|
||||
def remove_braces_and_quotes(text):
    """Strip every ``{"`` and then every ``"}`` sequence from ``text``.

    The two removals run one after the other, so a ``"}`` that only appears
    after ``{"`` has been removed is stripped as well.
    """
    return text.replace('{"', '').replace('"}', '')
|
||||
|
||||
|
||||
def publish_articles():
    """Copy every ready article into the ``articles`` table, skipping duplicates.

    The ready rows are read when this function is called, not when the module
    is imported. A caller that imports the module first and changes rows
    afterwards therefore publishes the current state.

    Each ready row is read as: index 0 title, 1 text, 2 link, 4 ``similar_d``
    marker, 5 category. A row is skipped when its title or its link occurs as
    a substring of any title already in ``articles``. Otherwise ``{"`` / ``"}``
    residue is removed from title and text and the row is inserted.

    Returns:
        None. Progress is reported with ``print``.
    """
    ready_rows = get_ready_data()

    # Load the existing titles once; titles published below are added locally
    # so later rows in the same run are still checked against them.
    known_titles = set(get_existing_titles())

    for row in ready_rows:
        title = row[0]
        text = row[1]
        link = row[2]
        similar_d = row[4]
        category = row[5]

        if any(title in known or link in known for known in known_titles):
            continue

        slug = create_slug(title)
        source_id = get_source_id(link, similar_d)

        text = remove_braces_and_quotes(text)
        title = remove_braces_and_quotes(title)
        print(f"Source: {source_id}")
        print(f"Link: {link}")
        insert_final(title, text, slug, link, source_id, category)
        known_titles.add(title)
        print(f"Publishing: {title}")
|
||||
@@ -1,22 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Test Pyth</title>
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<article>
|
||||
<h2>Test Title 1</h2>
|
||||
<p>Test Text 1</p>
|
||||
<a href="/article/one"> First</a>
|
||||
</article>
|
||||
<article>
|
||||
<h2>Test Title 2</h2>
|
||||
<p>Test Text 2</p>
|
||||
<a href="/article/two">Second</a>
|
||||
</article>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,12 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Article</title>
|
||||
</head>
|
||||
<body>
|
||||
<h2>Test Title</h2>
|
||||
<p>Test Text</p>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,12 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Article</title>
|
||||
</head>
|
||||
<body>
|
||||
<h2>Test Title</h2>
|
||||
<p>Test Text</p>
|
||||
</body>
|
||||
</html>
|
||||
@@ -7,7 +7,7 @@ from langchain.vectorstores.pgvector import PGVector
|
||||
from openai import OpenAI
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
from scrapingsingle import get_article_links, insert_data, is_similar_data
|
||||
from pyth.get_articles import get_article_links, insert_data, is_similar_data
|
||||
import os
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@@ -2,7 +2,7 @@ import unittest
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
import os
|
||||
from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
|
||||
from pyth.db_management import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
|
||||
|
||||
class TestIntegration(unittest.TestCase):
|
||||
host = os.getenv("DB_HOST")
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
from flask import Flask , render_template , jsonify
|
||||
from vectData import get_ready_data
|
||||
from flask_cors import CORS
|
||||
|
||||
|
||||
app = Flask(__name__)

# Apply flask-cors to the whole application.
CORS(app)


@app.route('/')
def index():
    """Render the landing page template."""
    return render_template("index.html")


@app.route('/article/one')
def articleone():
    """Render the first test article template."""
    return render_template("one.html")


@app.route('/article/two')
def articletwo():
    """Render the second test article template."""
    return render_template("two.html")


@app.route('/data/get/news', methods=['GET'])
def takenews():
    """Return the rows from ``get_ready_data`` as a JSON response."""
    data = get_ready_data()
    return jsonify(data)


if __name__ == "__main__":
    # Start the development server only when this file is run as a script,
    # so importing the module (tests, a WSGI server) does not block on it.
    app.run(debug=True)
|
||||
Reference in New Issue
Block a user