172 lines
6.3 KiB
Python
172 lines
6.3 KiB
Python
from bs4 import BeautifulSoup
|
|
import requests
|
|
from urllib.parse import urljoin
|
|
from openai import OpenAI
|
|
import os
|
|
from langchain_openai import OpenAIEmbeddings
|
|
from db_management import (insert_data ,is_similar_data ,get_all_links,cleansing )
|
|
import json
|
|
from dotenv import load_dotenv
|
|
import tiktoken
|
|
from json_repair import repair_json
|
|
|
|
# Load variables from a local .env file (python-dotenv) before anything
# below reads the environment.
load_dotenv()

# Imported from db_management; its behaviour is defined there. It is invoked
# once, at import time, before any scraping starts.
cleansing()

# Read for reference only: the clients below are constructed without
# arguments, so this value is not passed to them explicitly.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

client = OpenAI()
embeddings = OpenAIEmbeddings()

# Front pages that are scanned for article links.
dlinks = [
    'https://klix.ba',
    'https://srpskainfo.com',
    'https://bljesak.info',
    'https://www.index.hr',
    'https://avaz.ba',
    'https://www.telegraf.rs',
    'https://www.blic.rs',
    'https://www.vijesti.me',
    'https://dnevnik.hr',
    'https://24sata.hr',
]

# Mobile-browser User-Agent intended for the outgoing HTTP requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36',
}
|
|
|
|
def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Return how many tokens *string* occupies under *model*'s encoding.

    Args:
        string: Text to measure.
        model: Model name used to look up the tiktoken encoding.

    Returns:
        The length of the token sequence produced by encoding *string*.
    """
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)
|
|
|
|
def slice_text_at_2k_tokens(text, max_tokens=1950, model="gpt-3.5-turbo"):
    """Truncate *text* so it holds at most *max_tokens* tokens.

    The function always returns a string. It used to return a one-element
    list when the text already fitted and a string otherwise, so callers
    that passed the result through ``str()`` or an f-string received a
    list repr (brackets, quotes, escaped newlines) for short texts.

    Args:
        text: The text to limit.
        max_tokens: Maximum number of tokens to keep (default 1950).
        model: Model name used to look up the tiktoken encoding.

    Returns:
        str: *text* unchanged when it fits within the limit, otherwise the
        decoded first *max_tokens* tokens.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])
|
|
|
|
def slice_title_if_needed(text, max_tokens=100, model="gpt-3.5-turbo"):
    """Truncate a title so it holds at most *max_tokens* tokens.

    The function always returns a string. It used to return a one-element
    list when the title already fitted, which then ended up as a list repr
    (``['...']``) when the result was interpolated into a prompt.

    Args:
        text: The title text to limit.
        max_tokens: Maximum number of tokens to keep (default 100).
        model: Model name used to look up the tiktoken encoding.

    Returns:
        str: *text* unchanged when it fits within the limit, otherwise the
        decoded first *max_tokens* tokens.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])
|
|
|
|
def replace_with_spaces(text):
|
|
allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 "
|
|
cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text)
|
|
return cleaned_text
|
|
|
|
def fix_links(links_set):
    """Return a new set of links with a leading ``www.`` removed from the host.

    Only the host part of each URL is touched. The previous implementation
    tested for ``"www"`` anywhere in the link and then removed every
    ``"www."`` occurrence, which also rewrote paths and query strings
    (e.g. ``?u=www.example.com``) and produced URLs that no longer pointed
    at the original resource.

    Args:
        links_set: Iterable of URL strings.

    Returns:
        set: The normalised links. Links that differ only by the ``www.``
        host prefix collapse into one entry.
    """
    from urllib.parse import urlsplit

    modified_links = set()
    for link in links_set:
        try:
            parts = urlsplit(link)
        except ValueError:
            # Malformed URL (e.g. a broken IPv6 literal): keep it untouched.
            modified_links.add(link)
            continue
        # Keep any "user:pass@" prefix intact and look only at host[:port].
        userinfo, at_sign, host_port = parts.netloc.rpartition("@")
        if host_port.startswith("www."):
            stripped = f"{userinfo}{at_sign}{host_port[len('www.'):]}"
            link = parts._replace(netloc=stripped).geturl()
        modified_links.add(link)
    return modified_links
|
|
|
|
# Module-level accumulators, both starting out as independent empty sets.
# total_links receives every article URL found while scanning the front pages.
total_links, collected_news = set(), set()
|
|
|
|
def get_article_links(url, already_checked, timeout=30):
    """Collect not-yet-seen links found inside ``<article>`` elements of *url*.

    Args:
        url: Page to download; also the base against which relative
            ``href`` values are resolved.
        already_checked: Set of URLs seen so far. It is updated in place
            with every link this call returns.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        list: Absolute URLs in document order, without entries already in
        *already_checked*. An empty list is returned when the request fails
        or the response status is not 200.
    """
    try:
        # `headers` must be passed by keyword: the second positional
        # parameter of requests.get is `params`, so the dict used to be sent
        # as a query string and the User-Agent header was never set.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable site should not abort the scan of the others.
        print(f"Error fetching {url}: {exc}")
        return []

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for anchor in article.find_all('a', href=True):
            link_value = urljoin(url, anchor['href'])
            if link_value not in already_checked:
                link_store.append(link_value)
                already_checked.add(link_value)
    return link_store
|
|
|
|
# NOTE: everything in this section sits above the ``__main__`` guard, so it
# runs whenever the module is loaded.

# URLs handed out by get_article_links so far; shared across all front pages
# so the same link is collected only once.
already_checked = set()

for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    # A falsy result (None or an empty list) contributes nothing.
    total_links.update(temp_links or ())

# Discard falsy entries such as empty strings.
final_links = set(filter(None, total_links))

# Links already stored in the database are not processed again.
db_links = set(get_all_links())
new_links = final_links.difference(db_links)

# Normalise the remaining links; the input set is copied before the call.
final_links = fix_links(set(new_links))
|
|
|
|
# Category labels offered to the model. The list is interpolated verbatim
# into the prompt, so it stays a list of exactly these strings.
_CATEGORY_OPTIONS = ['politics', 'business', 'sport', 'magazine', 'scitech']

# Display names stored with each article; 'other' is the fallback used when
# the model answers with a label outside _CATEGORY_OPTIONS.
_CATEGORY_TRANSLATION = {
    'politics': 'Politika',
    'business': 'Biznis',
    'sport': 'Sport',
    'magazine': 'Magazin',
    'scitech': 'Nauka i tehnologija',
    'other': 'Ostalo',
}

# Seconds to wait for an article download before giving up.
_ARTICLE_TIMEOUT = 30


def _as_text(value):
    """Return *value* as a plain string, joining list/tuple items with spaces.

    The slicing helpers may hand back either a string or a one-element list;
    calling ``str()`` on the list would inject brackets, quotes and escape
    sequences into the text.
    """
    if isinstance(value, (list, tuple)):
        return ' '.join(str(item) for item in value)
    return str(value)


def _scrape_article(link):
    """Download *link* and return ``(title_text, body_text)``, both cleaned.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            response, so error pages are never treated as articles.
    """
    # `headers` must be passed by keyword: the second positional parameter
    # of requests.get is `params`, not `headers`.
    response = requests.get(link, headers=headers, timeout=_ARTICLE_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    headings = soup.find_all(['h2', 'h1', 'h3'])
    title_text = ' '.join(node.get_text(strip=True) for node in headings)

    paragraphs = soup.find_all(['p'])
    body_text = ' '.join(node.get_text(strip=True) for node in paragraphs)

    title_text = replace_with_spaces(title_text)
    # Truncate first, then sanitise - the same order as before.
    body_text = replace_with_spaces(_as_text(slice_text_at_2k_tokens(body_text)))
    return title_text, body_text


def _clean_and_store(link, title_text, text_text):
    """Have the model clean and categorise the article, then store it.

    The article is inserted only when ``is_similar_data`` reports that no
    similar entry exists. Errors are not caught here; the caller decides how
    to handle them.
    """
    # Only when the body is close to the limit is the title capped as well.
    if num_tokens_from_string(text_text) > 1900:
        title_text = _as_text(slice_title_if_needed(title_text))

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Data analytic, Journalist and News reporter"},
            {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title and remove 'FOTO' and 'VIDEO' from title and text, from {_CATEGORY_OPTIONS} select category in wich that news belong, and provide the cleaned data make sure that its on Bosnian language and valid JSON object with 'title' field, 'category' and 'content' field."},
        ],
    )

    # The model output is passed through repair_json before parsing.
    generated_text = repair_json(completion.choices[0].message.content)
    response_data = json.loads(generated_text)
    title = response_data["title"]
    predicted_category = response_data["category"]
    text = response_data["content"]

    category = predicted_category.lower()
    if category not in _CATEGORY_OPTIONS:
        category = 'other'
    category = _CATEGORY_TRANSLATION.get(category, category.capitalize())

    # The embedding is computed over the repaired JSON string.
    vector = embeddings.embed_query(generated_text)

    print(f"Category: {category}")

    if not is_similar_data(title, text, link, vector, threshold=0.98):
        insert_data(title, text, link, vector, "NO", category)


if __name__ == '__main__':
    for link in final_links:
        if link in db_links:
            continue

        print(f"Processing link: {link}")
        # Recorded up front so the link is not attempted twice in this run.
        db_links.add(link)

        try:
            title_text, text_text = _scrape_article(link)
        except requests.RequestException as e:
            # A single unreachable article must not abort the whole run.
            print(f"Error fetching {link}: {e}")
            continue

        try:
            _clean_and_store(link, title_text, text_text)
        except Exception as e:
            # Top-level boundary: report the failure and move on.
            print(f"Error in completion: {e}")
|
|
|