Fixed response/JSON
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -7,6 +7,7 @@ from langchain.embeddings import OpenAIEmbeddings
|
|||||||
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
|
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
|
||||||
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
|
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
|
||||||
import json
|
import json
|
||||||
|
from json_repair import repair_json
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
@@ -16,7 +17,6 @@ embeddings = OpenAIEmbeddings()
|
|||||||
|
|
||||||
print(f"Checking for similar!")
|
print(f"Checking for similar!")
|
||||||
|
|
||||||
|
|
||||||
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
|
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
|
||||||
try:
|
try:
|
||||||
titles, links, embeddings = get_titles_links_embeddings()
|
titles, links, embeddings = get_titles_links_embeddings()
|
||||||
@@ -142,7 +142,6 @@ def processing_similar():
|
|||||||
else:
|
else:
|
||||||
user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
|
user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
|
||||||
if l == 4:
|
if l == 4:
|
||||||
print("4")
|
|
||||||
a_one = articles[0][0]
|
a_one = articles[0][0]
|
||||||
a_two = articles[1][0]
|
a_two = articles[1][0]
|
||||||
a_three = articles[2][0]
|
a_three = articles[2][0]
|
||||||
@@ -220,6 +219,8 @@ def processing_similar():
|
|||||||
])
|
])
|
||||||
generated_text = completion.choices[0].message.content
|
generated_text = completion.choices[0].message.content
|
||||||
|
|
||||||
|
generated_text = repair_json(generated_text)
|
||||||
|
|
||||||
response_data = json.loads(generated_text)
|
response_data = json.loads(generated_text)
|
||||||
title = a_one
|
title = a_one
|
||||||
text = response_data["content"]
|
text = response_data["content"]
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
|
|||||||
import json
|
import json
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import tiktoken
|
import tiktoken
|
||||||
|
from json_repair import repair_json
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
cleansing()
|
cleansing()
|
||||||
@@ -21,50 +21,50 @@ embeddings = OpenAIEmbeddings()
|
|||||||
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
|
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
|
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Return how many tokens *string* occupies under *model*'s tokenizer.

    Args:
        string: Text to tokenize.
        model: Model name handed to ``tiktoken.encoding_for_model`` to pick
            the encoding.

    Returns:
        The length of the token sequence produced by encoding *string*.
    """
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)
|
||||||
|
|
||||||
def slice_text_at_2k_tokens(text, max_tokens=1950):
    """Limit *text* to a token budget measured with the gpt-3.5-turbo encoding.

    Args:
        text: The string to measure and, if necessary, truncate.
        max_tokens: Maximum number of tokens to keep. Defaults to 1950, the
            value that used to be hard-coded in the body.

    Returns:
        ``[text]`` (a one-element list) when *text* already fits within
        *max_tokens*; otherwise a ``str`` decoded from the first
        *max_tokens* tokens.
    """
    # NOTE(review): the two branches return different types (list vs. str).
    # The shape is kept as-is because this function is imported by another
    # module; callers that need a plain string must unwrap the list rather
    # than call str() on it, which yields the list's repr. TODO confirm all
    # callers and unify on ``str``.
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    tokens = encoding.encode(text)

    if len(tokens) <= max_tokens:
        return [text]

    return encoding.decode(tokens[:max_tokens])
|
||||||
|
|
||||||
|
def slice_title_if_needed(text, max_tokens=100):
    """Return *text* truncated to at most *max_tokens* gpt-3.5-turbo tokens.

    Args:
        text: The title string to measure and, if necessary, truncate.
        max_tokens: Maximum number of tokens to keep. Defaults to 100, the
            value that used to be hard-coded in the body.

    Returns:
        *text* unchanged when it already fits, otherwise the string decoded
        from its first *max_tokens* tokens. The result is always a ``str``.
    """
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    tokens = encoding.encode(text)

    if len(tokens) <= max_tokens:
        # Return the string itself, not ``[text]``: the result is formatted
        # directly into the prompt, where a list would render as "['...']".
        return text

    return encoding.decode(tokens[:max_tokens])
|
||||||
|
|
||||||
def replace_with_spaces(text):
    """Replace every character outside the allowed alphabet with a space.

    The allowed alphabet is ASCII letters and digits, the space character,
    and the letters with diacritics listed in the literal below
    (Č č Ć ć Đ đ Š š Ž ž).

    Args:
        text: The string to clean.

    Returns:
        A string of the same length as *text* in which each disallowed
        character has been replaced by a single space.
    """
    # The previous literal contained one mis-decoded code point where
    # "đŠšŽ" belongs, so those four letters were being blanked out.
    # A frozenset gives constant-time membership tests per character.
    allowed_chars = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        "ČčĆćDždžĐđŠšŽž0123456789 "
    )
    return "".join(char if char in allowed_chars else " " for char in text)
|
||||||
|
|
||||||
|
|
||||||
def _strip_www_from_host(link):
    """Return *link* with a leading ``www.`` removed from its host part only."""
    head, sep, tail = link.partition("//")
    if sep and (not head or head.endswith(":")):
        # "scheme://host/..." or protocol-relative "//host/...".
        prefix, rest = head + sep, tail
    else:
        # No scheme marker in front: treat the whole string as host + path.
        prefix, rest = "", link
    if rest.startswith("www."):
        rest = rest[len("www."):]
    return prefix + rest


def fix_links(links_set):
    """Normalize links by dropping a leading ``www.`` from each host.

    Only the start of the host is touched. A ``www.`` that appears later in
    the URL (for example inside a query string carrying another URL) or in
    the middle of a host name is left alone; a blanket ``str.replace`` would
    corrupt such links.

    Args:
        links_set: Iterable of link strings.

    Returns:
        A new ``set`` containing the normalized links; the input is not
        modified.
    """
    return {_strip_www_from_host(link) for link in links_set}
|
||||||
|
|
||||||
total_links = set()
|
total_links = set()
|
||||||
collected_news = set()
|
collected_news = set()
|
||||||
|
|
||||||
|
|
||||||
def get_article_links(url, already_checked):
|
def get_article_links(url, already_checked):
|
||||||
response = requests.get(url,headers)
|
response = requests.get(url,headers)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
@@ -81,25 +81,22 @@ def get_article_links(url, already_checked):
|
|||||||
already_checked.add(link_value)
|
already_checked.add(link_value)
|
||||||
return link_store
|
return link_store
|
||||||
|
|
||||||
|
|
||||||
already_checked = set()
|
already_checked = set()
|
||||||
|
|
||||||
for dlink in dlinks:
|
for dlink in dlinks:
|
||||||
temp_links = get_article_links(dlink, already_checked)
|
temp_links = get_article_links(dlink, already_checked)
|
||||||
if temp_links:
|
if temp_links:
|
||||||
total_links.update(temp_links)
|
total_links.update(temp_links)
|
||||||
|
|
||||||
final_links = {item for item in total_links if item}
|
final_links = {item for item in total_links if item}
|
||||||
|
|
||||||
db_links = set(get_all_links())
|
db_links = set(get_all_links())
|
||||||
new_links = final_links - db_links
|
new_links = final_links - db_links
|
||||||
final_links = new_links
|
final_links = new_links
|
||||||
final_links = set(final_links)
|
final_links = set(final_links)
|
||||||
|
|
||||||
final_links = fix_links(final_links)
|
final_links = fix_links(final_links)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
for link in final_links:
|
for link in final_links:
|
||||||
response = requests.get(link,headers)
|
response = requests.get(link,headers)
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
@@ -117,24 +114,26 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
text_text = slice_text_at_2k_tokens(text_text)
|
text_text = slice_text_at_2k_tokens(text_text)
|
||||||
text_text = replace_with_spaces(str(text_text))
|
text_text = replace_with_spaces(str(text_text))
|
||||||
|
|
||||||
|
ttk = num_tokens_from_string(text_text)
|
||||||
|
|
||||||
|
if ttk > 1900:
|
||||||
|
title_text = slice_title_if_needed(title_text)
|
||||||
try:
|
try:
|
||||||
completion = client.chat.completions.create(
|
completion = client.chat.completions.create(
|
||||||
model="gpt-3.5-turbo",
|
model="gpt-3.5-turbo",
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "system", "content": "Data analytic, Journalist and News reporter"},
|
{"role": "system", "content": "Data analytic, Journalist and News reporter"},
|
||||||
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
|
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."}
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
generated_text = completion.choices[0].message.content
|
generated_text = completion.choices[0].message.content
|
||||||
|
|
||||||
generated_text = generated_text
|
generated_text = repair_json(generated_text)
|
||||||
|
|
||||||
response_data = json.loads(generated_text)
|
response_data = json.loads(generated_text)
|
||||||
|
|
||||||
title = response_data["title"]
|
title = response_data["title"]
|
||||||
text = response_data["content"]
|
text = response_data["content"]
|
||||||
|
|
||||||
vector = embeddings.embed_query(generated_text)
|
vector = embeddings.embed_query(generated_text)
|
||||||
|
|
||||||
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
||||||
|
|||||||
Reference in New Issue
Block a user