2023-12-25 12:31:55 +01:00
from bs4 import BeautifulSoup
import requests
from urllib . parse import urljoin
2024-01-02 15:00:07 +01:00
from openai import OpenAI , APIError
2023-12-25 12:31:55 +01:00
import os
from langchain . embeddings import OpenAIEmbeddings
2024-01-06 08:17:05 +01:00
from vectData import ( insert_data , is_similar_data , get_similar , get_specific_data , get_all_links , cleansing , modify_similar_data )
2023-12-25 12:31:55 +01:00
import json
2024-01-02 15:00:07 +01:00
from dotenv import load_dotenv
import tiktoken
2023-12-25 12:31:55 +01:00
2024-01-02 15:00:07 +01:00
load_dotenv ( )
cleansing ( )
OPENAI_API_KEY = os . getenv ( " OPENAI_API_KEY " )
2023-12-25 12:31:55 +01:00
client = OpenAI ( )
embeddings = OpenAIEmbeddings ( )
dlinks = [ ' https://klix.ba ' , ' https://srpskainfo.com ' , ' https://bljesak.info ' ]
headers = { ' User-Agent ' : ' Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36 ' }
2024-01-02 15:00:07 +01:00
def num_tokens_from_string ( string : str , model = " gpt-3.5-turbo " ) - > int :
encoding = tiktoken . encoding_for_model ( model )
return len ( encoding . encode ( string ) )
def slice_text_at_2k_tokens ( text ) :
encoding_name = " gpt-3.5-turbo "
max_tokens = 2000
encoding = tiktoken . encoding_for_model ( encoding_name )
tokens = encoding . encode ( text )
if len ( tokens ) < = max_tokens :
return [ text ]
sliced_tokens = tokens [ : max_tokens ]
sliced_text = encoding . decode ( sliced_tokens )
return sliced_text
def replace_with_spaces ( text ) :
allowed_chars = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 "
cleaned_text = ' ' . join ( char if char in allowed_chars else ' ' for char in text )
return cleaned_text
2024-01-06 08:17:05 +01:00
def fix_links ( links_set ) :
modified_links = set ( )
for link in links_set :
if " www " in link :
modified_link = link . replace ( " www. " , " " )
modified_links . add ( modified_link )
else :
modified_links . add ( link )
return modified_links
2023-12-25 12:31:55 +01:00
total_links = set ( )
collected_news = set ( )
2024-01-02 15:00:07 +01:00
2023-12-25 12:31:55 +01:00
def get_article_links ( url , already_checked ) :
response = requests . get ( url , headers )
if response . status_code == 200 :
soup = BeautifulSoup ( response . text , ' html.parser ' )
articles = soup . find_all ( ' article ' )
link_store = [ ]
for article in articles :
links = article . find_all ( ' a ' , href = True )
for link in links :
link_value = urljoin ( url , link [ ' href ' ] )
if link_value not in already_checked :
link_store . append ( link_value )
already_checked . add ( link_value )
return link_store
2024-01-02 15:00:07 +01:00
2023-12-25 12:31:55 +01:00
already_checked = set ( )
for dlink in dlinks :
temp_links = get_article_links ( dlink , already_checked )
if temp_links :
total_links . update ( temp_links )
final_links = { item for item in total_links if item }
2024-01-02 15:00:07 +01:00
db_links = set ( get_all_links ( ) )
new_links = final_links - db_links
final_links = new_links
2024-01-06 08:17:05 +01:00
final_links = set ( final_links )
2024-01-02 15:00:07 +01:00
2024-01-06 08:17:05 +01:00
final_links = fix_links ( final_links )
2023-12-25 12:31:55 +01:00
2024-01-02 15:00:07 +01:00
if __name__ == ' __main__ ' :
for link in final_links :
2023-12-25 12:31:55 +01:00
response = requests . get ( link , headers )
soup = BeautifulSoup ( response . text , ' html.parser ' )
titles = soup . find_all ( [ ' h2 ' , ' h1 ' , ' h3 ' ] )
title_text = ' ' . join ( [ title . get_text ( strip = True ) for title in titles ] )
texts = soup . find_all ( [ ' p ' ] )
text_text = ' ' . join ( [ text . get_text ( strip = True ) for text in texts ] )
2024-01-02 15:00:07 +01:00
text_text = text_text
title_text = title_text
title_text = replace_with_spaces ( title_text )
print ( f " Tokens usage: { num_tokens_from_string ( text_text , ' gpt-3.5-turbo ' ) } " )
text_text = slice_text_at_2k_tokens ( text_text )
text_text = replace_with_spaces ( str ( text_text ) )
2023-12-25 12:31:55 +01:00
try :
completion = client . chat . completions . create (
model = " gpt-3.5-turbo " ,
messages = [
{ " role " : " system " , " content " : " Data analytic, Journalist and News reporter " } ,
{ " role " : " user " , " content " : rf " Extract relevant information from the following input: Title: { title_text } , Text: { text_text } . Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with ' title ' and ' content ' fields. " }
]
)
generated_text = completion . choices [ 0 ] . message . content
2024-01-02 15:00:07 +01:00
generated_text = generated_text
2023-12-25 12:31:55 +01:00
response_data = json . loads ( generated_text )
title = response_data [ " title " ]
text = response_data [ " content " ]
2024-01-02 15:00:07 +01:00
#print("*********************************")
#print(f"Title: {title}")
#print("---------------------------------")
#print(f"Content : {text}")
#print("*********************************")
2023-12-25 12:31:55 +01:00
vector = embeddings . embed_query ( generated_text )
2024-01-02 15:00:07 +01:00
if not is_similar_data ( title , text , link , vector , threshold = 0.98 ) :
similar_d = " NO "
insert_data ( title , text , link , vector , similar_d )
2023-12-25 12:31:55 +01:00
except Exception as e :
print ( f " Error in completion: { e } " )
continue
2024-01-02 15:00:07 +01:00
2024-01-06 08:17:05 +01:00
2024-01-02 15:00:07 +01:00
def comb_similar ( ) :
print ( " Checking similar " )
similar_article = get_similar ( )
grouped_data = { }
for sa in similar_article :
if similar_article :
first_t = get_specific_data ( sa [ 0 ] )
second_t = get_specific_data ( sa [ 1 ] )
link_f = first_t [ 0 ] [ 2 ]
link_s = second_t [ 0 ] [ 2 ]
f_text = first_t [ 0 ] [ 1 ]
s_text = second_t [ 0 ] [ 1 ]
f_title = first_t [ 0 ] [ 0 ]
s_title = second_t [ 0 ] [ 0 ]
if f_title in grouped_data :
grouped_data [ f_title ] . append ( ( f_text , link_f ) )
else :
grouped_data [ f_title ] = [ ( f_text , link_f ) ]
if s_title in grouped_data :
grouped_data [ s_title ] . append ( ( s_text , link_s ) )
else :
grouped_data [ s_title ] = [ ( s_text , link_s ) ]
for title , tuples in grouped_data . items ( ) :
if len ( tuples ) == 3 :
text1 , link1 = tuples [ 0 ]
text2 , link2 = tuples [ 1 ]
text3 , link3 = tuples [ 2 ]
t1check = num_tokens_from_string ( text1 )
t2check = num_tokens_from_string ( text2 )
t3check = num_tokens_from_string ( text3 )
slice_if_more = t1check , t2check , t3check
if slice_if_more < 2000 :
combined_text = f " { text1 } { text2 } { text3 } "
combined_text = slice_text_at_2k_tokens ( combined_text )
user_message = rf " Here is text { combined_text } , combined from 3 sources, filter text, and make news content, return as JSON only with ' content ' field "
2024-01-06 08:17:05 +01:00
if link1 != link2 and link1 != link3 and link2 != link3 :
link = f " { link1 } { link2 } { link3 } "
else :
link = link1
2024-01-02 15:00:07 +01:00
else :
user_message = rf " Here are 3 texts { text1 } { text2 } and { text3 } , combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with ' content ' field. "
2024-01-06 08:17:05 +01:00
if link1 != link2 and link1 != link3 and link2 != link3 :
link = f " { link1 } { link2 } { link3 } "
else :
link = link1
2024-01-02 15:00:07 +01:00
else :
ftcheck = num_tokens_from_string ( f_text )
stcheck = num_tokens_from_string ( s_text )
fscomb = ftcheck + stcheck
if fscomb < 2000 :
combined_text = f " { f_text } { s_text } "
user_message = rf " Here is text { combined_text } , combined from 2 sources, filter text, and make news content, return as JSON only with ' content ' field "
2024-01-06 08:17:05 +01:00
if link_f != link_s :
link = f " { link_f } { link_s } "
else :
link = link_f
2024-01-02 15:00:07 +01:00
else :
user_message = rf " Here are 2 texts { f_text } and { s_text } , combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with ' content ' field. "
2024-01-06 08:17:05 +01:00
if link_f != link_s :
link = f " { link_f } { link_s } "
else :
link = link_f
2024-01-02 15:00:07 +01:00
try :
completion = client . chat . completions . create (
model = " gpt-3.5-turbo " ,
messages = [
{ " role " : " system " , " content " : " Data analytic, Journalist and News reporter " } ,
{ " role " : " user " , " content " : user_message }
]
)
generated_text = completion . choices [ 0 ] . message . content
if similar_article :
if f_title == s_title :
print ( f_title )
modify_similar_data ( first_t , " SOURCE " )
similar_article . remove ( sa )
print ( " Modified " )
else :
2024-01-06 08:17:05 +01:00
print ( f " First: { f_title } " )
2024-01-02 15:00:07 +01:00
print ( f " Second: { s_title } " )
modify_similar_data ( first_t , " SOURCE " )
modify_similar_data ( second_t , " SOURCE " )
similar_article . remove ( sa )
print ( " Modified " )
else :
print ( " Similar list is empty " )
response_data = json . loads ( generated_text )
title = f_title
text = response_data [ " content " ]
vector = embeddings . embed_query ( generated_text )
if not is_similar_data ( title , text , link , vector , threshold = 0.98 ) :
similar_d = " NO "
insert_data ( title , text , link , vector , similar_d )
except Exception as e :
print ( f " Error in completion: { e } " )
continue