organizing code

This commit is contained in:
2024-01-07 03:41:32 +01:00
parent 96a2d88895
commit b7a0e5478c
5 changed files with 122 additions and 235 deletions

View File

@@ -1,10 +1,10 @@
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from openai import OpenAI , APIError
from openai import OpenAI
import os
from langchain.embeddings import OpenAIEmbeddings
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data)
from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
import json
from dotenv import load_dotenv
import tiktoken
@@ -39,7 +39,7 @@ def slice_text_at_2k_tokens(text):
sliced_tokens = tokens[:max_tokens]
sliced_text = encoding.decode(sliced_tokens)
return sliced_text
@@ -82,7 +82,6 @@ def get_article_links(url, already_checked):
return link_store
already_checked = set()
for dlink in dlinks:
@@ -116,8 +115,6 @@ if __name__ == '__main__':
title_text = replace_with_spaces(title_text)
print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")
text_text = slice_text_at_2k_tokens(text_text)
text_text = replace_with_spaces(str(text_text))
@@ -138,13 +135,6 @@ if __name__ == '__main__':
title = response_data["title"]
text = response_data["content"]
#print("*********************************")
#print(f"Title: {title}")
#print("---------------------------------")
#print(f"Content : {text}")
#print("*********************************")
vector = embeddings.embed_query(generated_text)
if not is_similar_data(title, text, link, vector, threshold=0.98):