"""Integration test for the article-link crawler and the LLM clean-up step.

The test talks to real services: an HTTP server at ``dlinks`` (a loopback
address, presumably a local fixture site that must already be running --
TODO confirm), the OpenAI chat-completions API and the OpenAI embeddings API.
"""

import json
import os
import unittest
from unittest.mock import patch

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from openai import OpenAI

from pyth.get_articles import get_article_links, insert_data, is_similar_data

# Load variables from a .env file before the API clients are constructed.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

client = OpenAI()
embeddings = OpenAIEmbeddings()

# Shared crawl state handed to / inspected after the helper call.
already_checked = set()
total_links = set()
collected_news = set()

# Root URL the crawl starts from.
dlinks = 'http://127.0.0.1:5000/'

# Upper bound (seconds) for each page fetch so an unresponsive server fails
# the test instead of hanging it forever.
REQUEST_TIMEOUT_SECONDS = 10


class TestIntegration(unittest.TestCase):
    """End-to-end check: crawl links, fetch pages, clean them with the LLM."""

    def test_integration(self):
        """Crawl from ``dlinks`` and validate the cleaned title/content.

        Steps:
          1. Call ``get_article_links`` and expect two URLs to have been
             recorded in ``already_checked``.
          2. For every URL in ``total_links``: download the page, gather the
             heading and paragraph text, ask the chat model for a JSON object
             with ``title`` and ``content`` fields, embed the model output,
             and check the expected marker strings.
          3. Expect ``total_links`` to hold two URLs.
        """
        # The helper is expected to add visited URLs to `already_checked`;
        # the assertion right below depends on that.
        get_article_links(dlinks, already_checked)
        self.assertEqual(len(already_checked), 2)

        # NOTE(review): nothing in this file ever adds to `total_links`.
        # Unless the helper populates this exact set object some other way,
        # the loop below never executes and the final count assertion sees 0.
        # The return value of get_article_links may have been the intended
        # iterable -- confirm against pyth.get_articles before changing it.
        for link in total_links:
            response = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)
            # Fail early on 4xx/5xx instead of sending an error page to the
            # model.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            headings = soup.find_all(['h2', 'h1', 'h3'])
            title_text = ' '.join(
                heading.get_text(strip=True) for heading in headings
            )
            paragraphs = soup.find_all(['p'])
            text_text = ' '.join(
                paragraph.get_text(strip=True) for paragraph in paragraphs
            )

            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content": "Data analytic, Journalist and News reporter",
                    },
                    {
                        "role": "user",
                        "content": (
                            "Extract relevant information from the following input: "
                            f"Title: {title_text}, Text: {text_text}. "
                            "Remove any non-news element related to the current text and title, "
                            "and provide the cleaned data as a JSON object with "
                            "'title' and 'content' fields."
                        ),
                    },
                ],
            )
            generated_text = completion.choices[0].message.content

            # The prompt asks for a JSON object; json.loads raises (and so
            # fails the test) if the model answers with anything else.
            response_data = json.loads(generated_text)
            title = response_data["title"]
            text = response_data["content"]

            # Exercise the embeddings API on the raw model output and make
            # sure something came back.
            vector = embeddings.embed_query(generated_text)
            self.assertGreater(len(vector), 0)

            self.assertIn("Test Title", title)
            self.assertIn("Test Text", text)

        # Placement after the loop is assumed; the original layout was lost.
        self.assertEqual(len(total_links), 2)