Combine similar article
This commit is contained in:
BIN
pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc
Normal file
BIN
pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc
Normal file
Binary file not shown.
BIN
pyth/tests/__pycache__/test_vectData.cpython-310.pyc
Normal file
BIN
pyth/tests/__pycache__/test_vectData.cpython-310.pyc
Normal file
Binary file not shown.
60
pyth/tests/test_scrapingsingle.py
Normal file
60
pyth/tests/test_scrapingsingle.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from langchain.vectorstores.pgvector import PGVector
|
||||
from openai import OpenAI
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
from scrapingsingle import get_article_links, insert_data, is_similar_data
|
||||
import os
|
||||
|
||||
# Load variables from a .env file into the environment before the OpenAI
# clients below are constructed.
load_dotenv()

# Read for reference; not used anywhere else in the visible part of this module.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Shared API handles used by the integration test.
client = OpenAI()
embeddings = OpenAIEmbeddings()

# Module-level bookkeeping shared with the test case below.
already_checked, total_links, collected_news = set(), set(), set()

# Base URL handed to get_article_links by the test.
dlinks = "http://127.0.0.1:5000/"
|
||||
|
||||
class TestIntegration(unittest.TestCase):
    """Integration test for link discovery, page scraping and LLM clean-up.

    Uses the module-level ``dlinks``, ``already_checked``, ``total_links``,
    ``client`` and ``embeddings`` objects. It performs real HTTP requests and
    calls the OpenAI chat-completion and embedding APIs.
    """

    # Upper bound (seconds) for each page fetch, so an unresponsive server
    # fails the test instead of hanging the whole run.
    REQUEST_TIMEOUT = 10

    def test_integration(self):
        """Discover links, then scrape and clean every page in ``total_links``.

        Asserts that exactly two URLs were recorded in ``already_checked``,
        that every cleaned article contains the expected title and text
        markers, and that ``total_links`` holds exactly two entries.
        """
        # The return value is intentionally not bound: the original code
        # stored it in ``link`` and immediately shadowed it with the loop
        # variable below, so it was never used.
        get_article_links(dlinks, already_checked)
        self.assertEqual(len(already_checked), 2)

        # NOTE(review): nothing visible in this module adds to ``total_links``.
        # Unless it is populated elsewhere, this loop is skipped and the final
        # length assertion fails - confirm whether the return value of
        # get_article_links should be iterated instead.
        for link in total_links:
            response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            # Fail on HTTP errors instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            title_text = ' '.join(
                heading.get_text(strip=True)
                for heading in soup.find_all(['h2', 'h1', 'h3'])
            )
            text_text = ' '.join(
                paragraph.get_text(strip=True)
                for paragraph in soup.find_all(['p'])
            )

            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
                ]
            )
            generated_text = completion.choices[0].message.content

            # The prompt asks for a JSON object with 'title' and 'content'.
            response_data = json.loads(generated_text)
            title = response_data["title"]
            text = response_data["content"]

            # The embedding itself is not inspected; the call only has to
            # complete without raising.
            embeddings.embed_query(generated_text)

            self.assertIn("Test Title", title)
            self.assertIn("Test Text", text)

        # NOTE(review): placed after the loop as a final count check; confirm
        # this matches the intended nesting.
        self.assertEqual(len(total_links), 2)
|
||||
|
||||
89
pyth/tests/test_vectData.py
Normal file
89
pyth/tests/test_vectData.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import unittest
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
import os
|
||||
from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
|
||||
|
||||
class TestIntegration(unittest.TestCase):
    """Integration tests for the vectData helpers against PostgreSQL.

    Connection settings come from the DB_HOST, DB_PORT, DB_USER, DB_PASSWORD
    and DB_NAME environment variables.
    """

    host = os.getenv("DB_HOST")
    port = os.getenv("DB_PORT")
    user = os.getenv("DB_USER")
    password = os.getenv("DB_PASSWORD")
    dbname = os.getenv("DB_NAME")

    @classmethod
    def _connect(cls):
        """Open a new psycopg2 connection using the class-level settings."""
        return psycopg2.connect(
            host=cls.host,
            port=cls.port,
            user=cls.user,
            password=cls.password,
            dbname=cls.dbname,
        )

    @classmethod
    def setUpClass(cls):
        """Re-read the settings, open the shared connection, run create_db."""
        settings = (
            ("host", "DB_HOST"),
            ("port", "DB_PORT"),
            ("user", "DB_USER"),
            ("password", "DB_PASSWORD"),
            ("dbname", "DB_NAME"),
        )
        for attribute, variable in settings:
            setattr(cls, attribute, os.getenv(variable))

        cls.conn = cls._connect()
        create_db(cls.conn)

    @classmethod
    def tearDownClass(cls):
        """Close the class-level connection."""
        cls.conn.close()

    def setUp(self):
        """Ensure an open connection and create a fresh cursor for the test."""
        # tearDown closes the connection after every test, so reconnect when
        # the one visible through ``self`` is already closed.
        if self.conn.closed:
            self.conn = self._connect()
        self.cursor = self.conn.cursor()

    def tearDown(self):
        """Close the cursor and then the connection if they are still open."""
        for resource in (self.cursor, self.conn):
            if not resource.closed:
                resource.close()

    def test_insert_and_retrieve_data(self):
        """A row written by insert_data is the only row get_data returns."""
        title, text, link = 'test_title', 'test_text', 'test_link'
        embedding = np.arange(1, 1537)

        insert_data(title, text, link, embedding)

        # NOTE(review): expects no other rows to be present - confirm the
        # table is empty before this test runs.
        self.assertEqual(get_data(), [(title, text, link)])

    def test_is_similar_data_integration(self):
        """is_similar_data reports True for a record that was just inserted."""
        record = ('test_title', 'test_text', 'test_link', np.arange(1, 1537))

        insert_data(*record)

        # The same check is made three times in a row, as in the original.
        for _ in range(3):
            self.assertTrue(is_similar_data(*record))

    def test_create_db_integration(self):
        """The 'vectorsvevijesti' table is listed in information_schema."""
        cursor = self.conn.cursor()
        cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'")
        row = cursor.fetchone()
        self.assertTrue(bool(row))
|
||||
|
||||
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user