organizing code
This commit is contained in:
@@ -1,10 +1,10 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from urllib.parse import urljoin
|
||||
from openai import OpenAI , APIError
|
||||
from openai import OpenAI
|
||||
import os
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data)
|
||||
from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
import tiktoken
|
||||
@@ -39,7 +39,7 @@ def slice_text_at_2k_tokens(text):
|
||||
|
||||
sliced_tokens = tokens[:max_tokens]
|
||||
sliced_text = encoding.decode(sliced_tokens)
|
||||
|
||||
|
||||
return sliced_text
|
||||
|
||||
|
||||
@@ -82,7 +82,6 @@ def get_article_links(url, already_checked):
|
||||
return link_store
|
||||
|
||||
|
||||
|
||||
already_checked = set()
|
||||
|
||||
for dlink in dlinks:
|
||||
@@ -116,8 +115,6 @@ if __name__ == '__main__':
|
||||
|
||||
title_text = replace_with_spaces(title_text)
|
||||
|
||||
|
||||
print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")
|
||||
text_text = slice_text_at_2k_tokens(text_text)
|
||||
text_text = replace_with_spaces(str(text_text))
|
||||
|
||||
@@ -138,13 +135,6 @@ if __name__ == '__main__':
|
||||
title = response_data["title"]
|
||||
text = response_data["content"]
|
||||
|
||||
#print("*********************************")
|
||||
#print(f"Title: {title}")
|
||||
#print("---------------------------------")
|
||||
#print(f"Content : {text}")
|
||||
#print("*********************************")
|
||||
|
||||
|
||||
vector = embeddings.embed_query(generated_text)
|
||||
|
||||
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
||||
|
||||
Reference in New Issue
Block a user