61 lines
2.1 KiB
Python
61 lines
2.1 KiB
Python
import unittest
|
|
from unittest.mock import patch
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from langchain.embeddings import OpenAIEmbeddings
|
|
from langchain.vectorstores.pgvector import PGVector
|
|
from openai import OpenAI
|
|
import json
|
|
from dotenv import load_dotenv
|
|
from pyth.get_articles import get_article_links, insert_data, is_similar_data
|
|
import os
|
|
|
|
# Pull variables from a local .env file into the process environment before
# any client below is constructed.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Both objects are built with their default arguments.
client = OpenAI()
embeddings = OpenAIEmbeddings()

# Module-level collections used by the integration test; each starts empty.
already_checked, total_links, collected_news = set(), set(), set()

# Base URL of the site the integration test crawls.
dlinks = "http://127.0.0.1:5000/"
|
|
|
|
class TestIntegration(unittest.TestCase):
    """Integration test: collect links, scrape each page, clean it via the LLM.

    The test performs real HTTP requests against ``dlinks`` and real calls to
    the chat-completions and embeddings clients created at module level.
    """

    # Seconds to wait on the site under test; without a timeout a stalled
    # server would block the whole test run indefinitely.
    REQUEST_TIMEOUT = 10

    def test_integration(self):
        """Crawl ``dlinks`` and check the cleaned title/content of each link."""
        # The return value is not used by this test; only the effect on
        # ``already_checked`` is asserted.
        get_article_links(dlinks, already_checked)
        self.assertEqual(len(already_checked), 2)

        # NOTE(review): nothing in the visible code adds to ``total_links``,
        # so this loop body does not execute as written. Confirm whether the
        # result of get_article_links() was meant to populate it.
        # NOTE(review): the original indentation was lost; the three trailing
        # assertions are assumed to belong to the loop body.
        for link in total_links:
            response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            # Fail at the request rather than sending an error page to the LLM.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            headings = soup.find_all(['h2', 'h1', 'h3'])
            title_text = ' '.join(h.get_text(strip=True) for h in headings)

            paragraphs = soup.find_all(['p'])
            text_text = ' '.join(p.get_text(strip=True) for p in paragraphs)

            prompt = (
                "Extract relevant information from the following input: "
                f"Title: {title_text}, Text: {text_text}. "
                "Remove any non-news element related to the current text and title, "
                "and provide the cleaned data as a JSON object with 'title' and "
                "'content' fields."
            )
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": prompt},
                ],
            )
            generated_text = completion.choices[0].message.content

            # json.loads raises if the reply is not a valid JSON document,
            # which fails the test.
            response_data = json.loads(generated_text)
            title = response_data["title"]
            text = response_data["content"]

            # The embedding itself is not inspected; the call only has to
            # complete without raising.
            embeddings.embed_query(generated_text)

            self.assertIn("Test Title", title)
            self.assertIn("Test Text", text)
            self.assertEqual(len(total_links), 2)
|
|
|