2023-12-25 12:31:55 +01:00
from bs4 import BeautifulSoup
import requests
from urllib . parse import urljoin
2024-01-07 03:41:32 +01:00
from openai import OpenAI
2023-12-25 12:31:55 +01:00
import os
2024-01-29 14:55:20 +01:00
from langchain_openai import OpenAIEmbeddings
from db_management import ( insert_data , is_similar_data , get_all_links , cleansing )
2023-12-25 12:31:55 +01:00
import json
2024-01-02 15:00:07 +01:00
from dotenv import load_dotenv
import tiktoken
2024-01-08 00:28:20 +01:00
from json_repair import repair_json
2023-12-25 12:31:55 +01:00
2024-01-02 15:00:07 +01:00
load_dotenv ( )
cleansing ( )
OPENAI_API_KEY = os . getenv ( " OPENAI_API_KEY " )
2023-12-25 12:31:55 +01:00
client = OpenAI ( )
embeddings = OpenAIEmbeddings ( )
2024-01-29 14:55:20 +01:00
dlinks = [ ' https://klix.ba ' , ' https://srpskainfo.com ' , ' https://bljesak.info ' , ' https://www.index.hr ' , ' https://avaz.ba ' , ' https://www.telegraf.rs ' , ' https://www.blic.rs ' , ' https://www.vijesti.me ' , ' https://dnevnik.hr ' , ' https://24sata.hr ' ]
2023-12-25 12:31:55 +01:00
headers = { ' User-Agent ' : ' Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36 ' }
2024-01-02 15:00:07 +01:00
def num_tokens_from_string ( string : str , model = " gpt-3.5-turbo " ) - > int :
encoding = tiktoken . encoding_for_model ( model )
return len ( encoding . encode ( string ) )
def slice_text_at_2k_tokens ( text ) :
encoding_name = " gpt-3.5-turbo "
2024-01-08 00:28:20 +01:00
max_tokens = 1950
2024-01-02 15:00:07 +01:00
encoding = tiktoken . encoding_for_model ( encoding_name )
tokens = encoding . encode ( text )
if len ( tokens ) < = max_tokens :
return [ text ]
sliced_tokens = tokens [ : max_tokens ]
sliced_text = encoding . decode ( sliced_tokens )
return sliced_text
2024-01-08 00:28:20 +01:00
def slice_title_if_needed ( text ) :
encoding_name = " gpt-3.5-turbo "
max_tokens = 100
encoding = tiktoken . encoding_for_model ( encoding_name )
tokens = encoding . encode ( text )
if len ( tokens ) < = max_tokens :
return [ text ]
sliced_tokens = tokens [ : max_tokens ]
sliced_text = encoding . decode ( sliced_tokens )
return sliced_text
2024-01-02 15:00:07 +01:00
def replace_with_spaces ( text ) :
allowed_chars = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 "
cleaned_text = ' ' . join ( char if char in allowed_chars else ' ' for char in text )
return cleaned_text
2024-01-06 08:17:05 +01:00
def fix_links ( links_set ) :
modified_links = set ( )
for link in links_set :
if " www " in link :
modified_link = link . replace ( " www. " , " " )
modified_links . add ( modified_link )
else :
modified_links . add ( link )
return modified_links
2023-12-25 12:31:55 +01:00
total_links = set ( )
collected_news = set ( )
def get_article_links ( url , already_checked ) :
response = requests . get ( url , headers )
if response . status_code == 200 :
soup = BeautifulSoup ( response . text , ' html.parser ' )
articles = soup . find_all ( ' article ' )
link_store = [ ]
for article in articles :
links = article . find_all ( ' a ' , href = True )
for link in links :
link_value = urljoin ( url , link [ ' href ' ] )
if link_value not in already_checked :
link_store . append ( link_value )
already_checked . add ( link_value )
return link_store
already_checked = set ( )
for dlink in dlinks :
temp_links = get_article_links ( dlink , already_checked )
if temp_links :
total_links . update ( temp_links )
final_links = { item for item in total_links if item }
2024-01-02 15:00:07 +01:00
db_links = set ( get_all_links ( ) )
new_links = final_links - db_links
final_links = new_links
2024-01-06 08:17:05 +01:00
final_links = set ( final_links )
final_links = fix_links ( final_links )
2023-12-25 12:31:55 +01:00
2024-01-02 15:00:07 +01:00
if __name__ == ' __main__ ' :
2024-01-08 00:28:20 +01:00
2024-01-29 14:55:20 +01:00
for link in final_links :
if link not in db_links :
print ( f " Processing link: { link } " )
db_links . add ( link )
2023-12-25 12:31:55 +01:00
2024-01-29 14:55:20 +01:00
response = requests . get ( link , headers )
soup = BeautifulSoup ( response . text , ' html.parser ' )
2023-12-25 12:31:55 +01:00
2024-01-29 14:55:20 +01:00
titles = soup . find_all ( [ ' h2 ' , ' h1 ' , ' h3 ' ] )
title_text = ' ' . join ( [ title . get_text ( strip = True ) for title in titles ] )
2024-01-02 15:00:07 +01:00
2024-01-29 14:55:20 +01:00
texts = soup . find_all ( [ ' p ' ] )
text_text = ' ' . join ( [ text . get_text ( strip = True ) for text in texts ] )
text_text = text_text
title_text = title_text
2024-01-02 15:00:07 +01:00
2024-01-29 14:55:20 +01:00
title_text = replace_with_spaces ( title_text )
text_text = slice_text_at_2k_tokens ( text_text )
text_text = replace_with_spaces ( str ( text_text ) )
ttk = num_tokens_from_string ( text_text )
category_options = [ ' politics ' , ' business ' , ' sport ' , ' magazine ' , ' scitech ' ]
2024-01-31 12:37:55 +01:00
category_translation = {
' politics ' : ' Politika ' ,
' business ' : ' Biznis ' ,
' sport ' : ' Sport ' ,
' magazine ' : ' Magazin ' ,
' scitech ' : ' Nauka i tehnologija ' ,
' other ' : ' Ostalo ' ,
}
2024-01-29 14:55:20 +01:00
if ttk > 1900 :
title_text = slice_title_if_needed ( title_text )
try :
completion = client . chat . completions . create (
model = " gpt-3.5-turbo " ,
messages = [
{ " role " : " system " , " content " : " Data analytic, Journalist and News reporter " } ,
{ " role " : " user " , " content " : rf " Extract relevant information from the following input: Title: { title_text } , Text: { text_text } . Remove any non-news element related to the current text and title and remove ' FOTO ' and ' VIDEO ' from title and text, from { category_options } select category in wich that news belong, and provide the cleaned data make sure that its on Bosnian language and valid JSON object with ' title ' field, ' category ' and ' content ' field. " }
] )
generated_text = completion . choices [ 0 ] . message . content
generated_text = repair_json ( generated_text )
response_data = json . loads ( generated_text )
title = response_data [ " title " ]
predicted_category = response_data [ " category " ]
text = response_data [ " content " ]
if predicted_category . lower ( ) in category_options :
category = predicted_category . lower ( )
else :
category = ' other '
2024-01-31 12:37:55 +01:00
category = category_translation . get ( category , category . capitalize ( ) )
2024-01-29 14:55:20 +01:00
vector = embeddings . embed_query ( generated_text )
print ( f " Category: { category } " )
2023-12-25 12:31:55 +01:00
2024-01-29 14:55:20 +01:00
if not is_similar_data ( title , text , link , vector , threshold = 0.98 ) :
similar_d = " NO "
insert_data ( title , text , link , vector , similar_d , category )
2024-01-02 15:00:07 +01:00
2024-01-29 14:55:20 +01:00
except Exception as e :
print ( f " Error in completion: { e } " )
continue
2024-01-02 15:00:07 +01:00