Combine similar article
This commit is contained in:
@@ -1,15 +1,20 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from urllib.parse import urljoin
|
||||
from openai import OpenAI
|
||||
from openai import OpenAI , APIError
|
||||
import os
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from langchain.vectorstores.pgvector import PGVector
|
||||
from vectData import insert_data ,is_similar_data
|
||||
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data)
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
import tiktoken
|
||||
|
||||
|
||||
# Credentials come from the process environment or a local .env file only.
# A secret must never be hard-coded here: anything committed to the repository
# is effectively public, and a key that has been committed must be revoked and
# replaced. Setting the variable in code before load_dotenv() would also mask
# the value from .env, because load_dotenv() does not override variables that
# are already set.
load_dotenv()

# Imported from vectData; it is executed at import time, before any scraping.
cleansing()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or define it in a .env file"
    )

# Both clients are created without an explicit key and pick it up from the
# OPENAI_API_KEY environment variable.
client = OpenAI()
embeddings = OpenAIEmbeddings()
|
||||
|
||||
@@ -17,9 +22,36 @@ dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
|
||||
|
||||
|
||||
|
||||
def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Return how many tokens *string* occupies for the given *model*.

    The tokenizer is looked up with ``tiktoken.encoding_for_model`` and the
    result is the length of the encoded token list.
    """
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)
|
||||
|
||||
def slice_text_at_2k_tokens(text, max_tokens=2000, model="gpt-3.5-turbo"):
    """Return *text* cut down to at most *max_tokens* tokens.

    Args:
        text: The string to limit.
        max_tokens: Maximum number of tokens to keep (default 2000).
        model: Model name used to select the tiktoken encoding.

    Returns:
        ``text`` itself when it already fits, otherwise the decoded first
        ``max_tokens`` tokens. The result is always a ``str``.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)

    if len(tokens) <= max_tokens:
        # Return the string itself, not a one-element list: callers feed the
        # result to str() and f-strings, where a list would leak its repr
        # (brackets, quotes, escaped newlines) into the article text.
        return text

    return encoding.decode(tokens[:max_tokens])
|
||||
|
||||
|
||||
# Characters that survive cleaning: ASCII letters, digits, the space, and the
# diacritic letters of the Bosnian/Croatian/Serbian Latin alphabet. The
# digraphs Dž/dž need no entry of their own; they are made of D/d and ž.
_ALLOWED_CHARS = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "ČčĆćĐđŠšŽž"
    "0123456789 "
)


def replace_with_spaces(text):
    """Replace every character outside the allowed alphabet with a space.

    Args:
        text: The string to clean.

    Returns:
        A string of the same length as *text* in which each character that is
        not a letter (including Č, Ć, Đ, Š, Ž in both cases), a digit or a
        space has been replaced by a single space.
    """
    return ''.join(char if char in _ALLOWED_CHARS else ' ' for char in text)
|
||||
|
||||
total_links = set()
|
||||
collected_news = set()
|
||||
|
||||
|
||||
def get_article_links(url, already_checked):
|
||||
response = requests.get(url,headers)
|
||||
if response.status_code == 200:
|
||||
@@ -36,6 +68,8 @@ def get_article_links(url, already_checked):
|
||||
already_checked.add(link_value)
|
||||
return link_store
|
||||
|
||||
|
||||
|
||||
already_checked = set()
|
||||
|
||||
for dlink in dlinks:
|
||||
@@ -44,8 +78,17 @@ for dlink in dlinks:
|
||||
total_links.update(temp_links)
|
||||
|
||||
final_links = {item for item in total_links if item}
|
||||
i = 0
|
||||
|
||||
for link in final_links:
|
||||
db_links = set(get_all_links())
|
||||
new_links = final_links - db_links
|
||||
final_links = new_links
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
for link in final_links:
|
||||
response = requests.get(link,headers)
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
@@ -54,6 +97,16 @@ for link in final_links:
|
||||
|
||||
texts = soup.find_all(['p'])
|
||||
text_text = ' '.join([text.get_text(strip=True) for text in texts])
|
||||
|
||||
text_text = text_text
|
||||
title_text = title_text
|
||||
|
||||
title_text = replace_with_spaces(title_text)
|
||||
|
||||
|
||||
print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")
|
||||
text_text = slice_text_at_2k_tokens(text_text)
|
||||
text_text = replace_with_spaces(str(text_text))
|
||||
|
||||
try:
|
||||
completion = client.chat.completions.create(
|
||||
@@ -65,23 +118,130 @@ for link in final_links:
|
||||
)
|
||||
generated_text = completion.choices[0].message.content
|
||||
|
||||
generated_text = generated_text
|
||||
|
||||
response_data = json.loads(generated_text)
|
||||
|
||||
title = response_data["title"]
|
||||
text = response_data["content"]
|
||||
|
||||
print("*********************************")
|
||||
print(f"Title: {title}")
|
||||
print("---------------------------------")
|
||||
print(f"Content : {text}")
|
||||
print("*********************************")
|
||||
#print("*********************************")
|
||||
#print(f"Title: {title}")
|
||||
#print("---------------------------------")
|
||||
#print(f"Content : {text}")
|
||||
#print("*********************************")
|
||||
|
||||
|
||||
vector = embeddings.embed_query(generated_text)
|
||||
|
||||
if not is_similar_data(title, text, link, vector, threshold=0.9):
|
||||
insert_data(title, text, link, vector)
|
||||
|
||||
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
||||
similar_d = "NO"
|
||||
insert_data(title, text, link, vector,similar_d)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in completion: {e}")
|
||||
continue
|
||||
|
||||
def comb_similar():
    """Merge each pair of similar stored articles into one generated article.

    For every pair returned by ``get_similar()`` the two records are loaded
    with ``get_specific_data``, a prompt containing their texts is sent to the
    chat model, the source records are flagged with
    ``modify_similar_data(..., "SOURCE")`` and the generated article is stored
    with ``insert_data`` unless ``is_similar_data`` reports a match at the
    0.98 threshold. Any error while generating or storing one pair is printed
    and that pair is skipped.

    NOTE(review): the nesting of this function was reconstructed from a
    rendering without indentation. It is assumed that one completion is
    requested per pair; confirm against the original file.
    """
    print("Checking similar")
    similar_article = get_similar()

    if not similar_article:
        print("Similar list is empty")
        return

    grouped_data = {}

    # Iterate over a snapshot: handled pairs are removed from
    # ``similar_article`` below, and removing from a list while looping over
    # it makes the loop skip the element that follows each removal.
    for sa in list(similar_article):
        first_t = get_specific_data(sa[0])
        second_t = get_specific_data(sa[1])
        if not first_t or not second_t:
            # One side of the pair could not be loaded; nothing to merge.
            continue

        # Rows are read positionally as (title, text, link).
        f_title, f_text, link_f = first_t[0][0], first_t[0][1], first_t[0][2]
        s_title, s_text, link_s = second_t[0][0], second_t[0][1], second_t[0][2]

        grouped_data.setdefault(f_title, []).append((f_text, link_f))
        grouped_data.setdefault(s_title, []).append((s_text, link_s))

        # Default prompt: built from the two texts of the current pair. It
        # does not depend on the group being inspected, so build it once.
        pair_tokens = num_tokens_from_string(f_text) + num_tokens_from_string(s_text)
        if pair_tokens < 2000:
            combined_text = f"{f_text}{s_text}"
            pair_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field"
        else:
            pair_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
        pair_link = f"{link_f} {link_s}"

        user_message, link = pair_message, pair_link

        # NOTE(review): every title group collected so far is inspected and
        # the last one decides the prompt, which mirrors the statement order
        # of the original. Confirm whether only the current pair's group was
        # meant to be checked.
        for tuples in grouped_data.values():
            if len(tuples) == 3:
                (text1, link1), (text2, link2), (text3, link3) = tuples

                # The three counts must be added up; comparing the tuple of
                # counts with an int raises TypeError.
                total_tokens = sum(
                    num_tokens_from_string(part) for part in (text1, text2, text3)
                )
                if total_tokens < 2000:
                    combined_text = slice_text_at_2k_tokens(f"{text1}{text2}{text3}")
                    if isinstance(combined_text, list):
                        # The slicer may hand back a one-element list for
                        # short input; unwrap it so the prompt gets plain text.
                        combined_text = combined_text[0]
                    user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field"
                else:
                    user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
                link = f"{link1} {link2} {link3}"
            else:
                user_message, link = pair_message, pair_link

        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message}
                ]
            )
            generated_text = completion.choices[0].message.content

            if f_title == s_title:
                print(f_title)
                modify_similar_data(first_t, "SOURCE")
            else:
                print(f"Second: {s_title}")
                modify_similar_data(first_t, "SOURCE")
                modify_similar_data(second_t, "SOURCE")
            # Drop the handled pair from the list returned by get_similar().
            similar_article.remove(sa)
            print("Modified")

            # The model is asked for JSON with a 'content' field; a reply that
            # is not valid JSON, or lacks that key, raises and is reported by
            # the handler below.
            response_data = json.loads(generated_text)
            title = f_title
            text = response_data["content"]

            vector = embeddings.embed_query(generated_text)

            if not is_similar_data(title, text, link, vector, threshold=0.98):
                similar_d = "NO"
                insert_data(title, text, link, vector, similar_d)

        except Exception as e:
            # Best effort: one failing pair must not stop the remaining ones.
            print(f"Error in completion: {e}")
            continue
|
||||
|
||||
comb_similar()
|
||||
Reference in New Issue
Block a user