pyth/tests/test_scrapingsingle.py

import unittest
from unittest.mock import patch
import requests
from bs4 import BeautifulSoup
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from openai import OpenAI
import json
from dotenv import load_dotenv
from pyth.get_articles import get_article_links, insert_data, is_similar_data
import os

# --- Crawl state shared with the test case below ---------------------------
# Root URL handed to get_article_links as the crawl starting point.
dlinks = 'http://127.0.0.1:5000/'
# Passed to get_article_links together with dlinks; the test checks its size
# after the call.
already_checked = set()
# Collection of article URLs the test iterates over.
total_links = set()
# Not referenced anywhere in the visible part of this file.
collected_news = set()

# --- Credentials and API clients -------------------------------------------
# Load variables from a local .env file into the process environment before
# anything below reads them.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Both objects are constructed without explicit arguments.
# NOTE(review): presumably they pick up credentials from the environment
# populated above -- confirm against the library versions in use.
client = OpenAI()
embeddings = OpenAIEmbeddings()

class TestIntegration(unittest.TestCase):
    """Integration test for the crawl -> fetch -> clean -> embed flow.

    The test talks to the site at ``dlinks`` and to the OpenAI API through
    the module-level ``client`` and ``embeddings`` objects, so both must be
    reachable for it to pass.
    """

    def test_integration(self):
        """Crawl ``dlinks`` and check every discovered article is cleaned.

        For each discovered link the page is downloaded, its headings and
        paragraphs are sent to the chat model for cleaning, and the JSON
        reply is checked for the expected title and content markers.
        """
        # Previously the crawl result was bound to ``link`` and immediately
        # shadowed by the loop variable, while the loop walked the (always
        # empty) module-level ``total_links`` -- so no per-article assertion
        # ever ran. Feed the crawl result into ``total_links`` instead.
        found_links = get_article_links(dlinks, already_checked)
        self.assertEqual(len(already_checked), 2)

        # NOTE(review): assumes get_article_links returns an iterable of URL
        # strings (or a single URL string) -- confirm against pyth.get_articles.
        if isinstance(found_links, str):
            found_links = [found_links]
        total_links.update(found_links or ())

        # Checked once, up front: fails fast (before any paid API call) when
        # the crawl did not yield the expected number of articles.
        self.assertEqual(len(total_links), 2)

        for link in total_links:
            # subTest keeps going after a failure and reports which link broke.
            with self.subTest(link=link):
                # A timeout stops a hung fixture server from blocking the run;
                # raise_for_status surfaces HTTP errors instead of parsing an
                # error page as if it were an article.
                response = requests.get(link, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                titles = soup.find_all(['h2', 'h1', 'h3'])
                title_text = ' '.join(title.get_text(strip=True) for title in titles)

                texts = soup.find_all(['p'])
                text_text = ' '.join(text.get_text(strip=True) for text in texts)

                completion = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                        {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
                    ]
                )
                generated_text = completion.choices[0].message.content

                # The prompt asks for a JSON object with 'title' and 'content'.
                response_data = json.loads(generated_text)
                title = response_data["title"]
                text = response_data["content"]

                # The embedding used to be computed and dropped; assert on it
                # so the call actually verifies something.
                vector = embeddings.embed_query(generated_text)
                self.assertTrue(vector, "embedding should not be empty")

                self.assertIn("Test Title", title)
                self.assertIn("Test Text", text)
Combine similar article 2024-01-02 15:00:07 +01:00			`import unittest`
			`from unittest.mock import patch`
			`import requests`
			`from bs4 import BeautifulSoup`
			`from langchain.embeddings import OpenAIEmbeddings`
			`from langchain.vectorstores.pgvector import PGVector`
			`from openai import OpenAI`
			`import json`
			`from dotenv import load_dotenv`
Changing from js to golang 2024-01-29 14:55:20 +01:00			`from pyth.get_articles import get_article_links, insert_data, is_similar_data`
Combine similar article 2024-01-02 15:00:07 +01:00			`import os`

			`load_dotenv()`

			`OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")`
			`client = OpenAI()`
			`embeddings = OpenAIEmbeddings()`


			`already_checked = set()`
			`total_links = set()`
			`collected_news = set()`
			`dlinks = 'http://127.0.0.1:5000/'`

			`class TestIntegration(unittest.TestCase):`


			`def test_integration(self):`
			`link = get_article_links(dlinks,already_checked)`
			`self.assertEqual(len(already_checked), 2)`

			`for link in total_links:`
			`response = requests.get(link)`
			`soup = BeautifulSoup(response.text, 'html.parser')`

			`titles = soup.find_all(['h2', 'h1', 'h3'])`
			`title_text = ' '.join([title.get_text(strip=True) for title in titles])`

			`texts = soup.find_all(['p'])`
			`text_text = ' '.join([text.get_text(strip=True) for text in texts])`

			`completion = client.chat.completions.create(`
			`model="gpt-3.5-turbo",`
			`messages=[`
			`{"role": "system", "content": "Data analytic, Journalist and News reporter"},`
			`{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}`
			`]`
			`)`
			`generated_text = completion.choices[0].message.content`

			`response_data = json.loads(generated_text)`
			`title = response_data["title"]`
			`text = response_data["content"]`

			`vector = embeddings.embed_query(generated_text)`

			`self.assertIn("Test Title", title)`
			`self.assertIn("Test Text", text)`
			`self.assertEqual(len(total_links), 2)`