diff --git a/pyth/.env b/pyth/.env new file mode 100644 index 0000000..c213e8f --- /dev/null +++ b/pyth/.env @@ -0,0 +1,7 @@ +OPENAI_API_KEY = "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" + +DB_HOST =localhost +DB_PORT =5432 +DB_USER =postgres +DB_PASSWORD =salmonela pljusti 221 hamo +DB_NAME =svevijestiweb \ No newline at end of file diff --git a/pyth/.gitlab-ci.yml b/pyth/.gitlab-ci.yml new file mode 100644 index 0000000..8cd8989 --- /dev/null +++ b/pyth/.gitlab-ci.yml @@ -0,0 +1,21 @@ +stages: + - test + +variables: + +before_script: + - pip install -r requirements.txt + +test_file1: + stage: test + script: + - python -m pytest tests/test_scrapingsingle.py + only: + - master + +test_file2: + stage: test + script: + - python -m pytest tests/test_vectData.py + only: + - master diff --git a/pyth/__pycache__/articles.cpython-310.pyc b/pyth/__pycache__/articles.cpython-310.pyc new file mode 100644 index 0000000..40e56eb Binary files /dev/null and b/pyth/__pycache__/articles.cpython-310.pyc differ diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc new file mode 100644 index 0000000..b39ce5c Binary files /dev/null and b/pyth/__pycache__/scrapingsingle.cpython-310.pyc differ diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc new file mode 100644 index 0000000..e806a8a Binary files /dev/null and b/pyth/__pycache__/vectData.cpython-310.pyc differ diff --git a/pyth/articles.py b/pyth/articles.py new file mode 100644 index 0000000..b5ae49f --- /dev/null +++ b/pyth/articles.py @@ -0,0 +1,241 @@ +import psycopg2 +import numpy as np +from dotenv import load_dotenv +import os +from openai import OpenAI +from langchain.embeddings import OpenAIEmbeddings +from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings +from scrapingsingle import num_tokens_from_string, 
# Combined source text above this many tokens is clipped before prompting.
MAX_PROMPT_TOKENS = 2000


def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Group ready articles whose embeddings are nearly identical.

    The first article not yet claimed seeds a group; every other unclaimed
    article whose cosine similarity to the seed exceeds ``threshold`` joins
    it.  Articles are keyed by ``(title, link)``.

    Args:
        eps: Unused; kept so existing callers keep working.
        min_samples: Unused; kept so existing callers keep working.
        threshold: Similarity a candidate must exceed to join a group.

    Returns:
        A list of groups, each a list of ``(title, link)`` tuples.  Articles
        without a match come back as single-element groups.  An empty list
        is returned when the database read fails.
    """
    try:
        titles, links, vectors = get_titles_links_embeddings()
    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []

    records = list(zip(titles, links, vectors))
    claimed = set()
    groups = []
    for seed_idx, (seed_title, seed_link, seed_vec) in enumerate(records):
        seed_key = (seed_title, seed_link)
        if seed_key in claimed:
            continue
        claimed.add(seed_key)
        group = [seed_key]
        for other_idx, (title, link, vec) in enumerate(records):
            key = (title, link)
            if other_idx == seed_idx or key in claimed:
                continue
            if calculate_cosine_similarity(seed_vec, vec) > threshold:
                claimed.add(key)
                group.append(key)
        groups.append(group)
    return groups


def _merge_links(links):
    """Join links with ', ', dropping repeats while keeping first-seen order."""
    return ", ".join(dict.fromkeys(links))


def _build_prompt(texts):
    """Build the user prompt asking the model to merge ``texts`` into one story."""
    count = len(texts)
    total_tokens = sum(num_tokens_from_string(text) for text in texts)
    joined = " ".join(texts)
    if total_tokens > MAX_PROMPT_TOKENS:
        # Too long for one request: clip the concatenation instead.
        clipped = slice_text_at_2k_tokens(joined)
        return rf"Here is text {clipped}, combined from {count} sources, filter text, and make news content, return as JSON only with a single 'content' field"
    return rf"Here are {count} texts {joined}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field."


def _combine_group(titles):
    """Merge the articles named by ``titles`` into one stored article.

    The source rows are tagged with the group marker and flagged not ready,
    then the model-written merge is inserted under the first title.

    Raises:
        LookupError: If one of the titles has no row in the database.
    """
    rows = []
    for title in titles:
        found = get_specific_data(title)
        if not found:
            raise LookupError(f"Article not found: {title}")
        rows.append(found[0])

    # Row layout from get_specific_data: index 1 is the text, index 2 the link.
    texts = [row[1] for row in rows]
    link = _merge_links(row[2] for row in rows)
    similar_d = "C: " + ", ".join(titles)

    for title in titles:
        modify_similar_data(similar_d, title)
        preparing_articles(False, title)

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Data analytic, Journalist and News reporter"},
            {"role": "user", "content": _build_prompt(texts)},
        ],
    )
    generated_text = repair_json(completion.choices[0].message.content)
    response_data = json.loads(generated_text)

    combined_title = titles[0]
    vector = embeddings.embed_query(generated_text)
    insert_data(combined_title, response_data["content"], link, vector, similar_d)
    print(f"Inserting combined: {combined_title}")


def processing_similar():
    """Merge every group of two or more similar articles into one article.

    Groups of any size are handled.  A failure in one group is reported and
    the remaining groups are still processed.
    """
    groups = find_and_group_similar_articles()
    if not groups:
        print("No similar articles found.")
        return

    for group in groups:
        titles = [entry[0] for entry in group if len(entry) >= 2]
        if len(titles) < 2:
            continue
        try:
            _combine_group(titles)
        except Exception as e:
            print(f"Error: {e}")
            print(titles[0])
    print("Done!.")


if __name__ == "__main__":
    processing_similar()
+importlib-metadata==4.6.4 +itsdangerous==2.1.2 +jeepney==0.7.1 +Jinja2==3.1.2 +joblib==1.3.2 +jsonpatch==1.33 +jsonpointer==2.4 +keyring==23.5.0 +langchain==0.0.352 +langchain-community==0.0.6 +langchain-core==0.1.3 +langcodes==3.3.0 +langsmith==0.0.74 +language-selector==0.1 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +louis==3.20.0 +macaroonbakery==1.3.1 +MarkupSafe==2.1.3 +marshmallow==3.20.1 +more-itertools==8.10.0 +multidict==6.0.4 +murmurhash==1.0.10 +mypy-extensions==1.0.0 +netifaces==0.11.0 +numpy==1.26.2 +oauthlib==3.2.0 +olefile==0.46 +openai==1.5.0 +packaging==23.2 +pbr==5.8.0 +pexpect==4.8.0 +pgvector==0.2.4 +Pillow==9.0.1 +preshed==3.0.9 +protobuf==3.12.4 +psycopg==3.1.15 +psycopg2-binary==2.9.9 +ptyprocess==0.7.0 +pycairo==1.20.1 +pycups==2.0.1 +pydantic==2.5.2 +pydantic_core==2.14.5 +PyGObject==3.42.1 +PyJWT==2.3.0 +pymacaroons==0.13.0 +PyNaCl==1.5.0 +pyparsing==2.4.7 +pyRFC3339==1.1 +python-apt==2.4.0+ubuntu2 +python-dateutil==2.8.1 +python-debian==0.1.43+ubuntu1.1 +python-dotenv==1.0.0 +pytz==2022.1 +pyxdg==0.27 +PyYAML==5.4.1 +regex==2023.10.3 +reportlab==3.6.8 +requests==2.31.0 +scikit-learn==1.3.2 +scipy==1.11.4 +SecretStorage==3.3.1 +six==1.16.0 +slugify==0.0.1 +smart-open==6.4.0 +sniffio==1.3.0 +soupsieve==2.5 +spacy==3.7.2 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +SQLAlchemy==1.4.31 +sqlalchemy-migrate==0.13.0 +sqlparse==0.4.2 +srsly==2.4.8 +systemd-python==234 +Tempita==0.5.2 +tenacity==8.2.3 +thinc==8.2.2 +threadpoolctl==3.2.0 +tiktoken==0.5.2 +tqdm==4.66.1 +typer==0.9.0 +typing-inspect==0.9.0 +typing_extensions==4.9.0 +ubuntu-advantage-tools==8001 +ubuntu-drivers-common==0.0.0 +ufw==0.36.1 +unattended-upgrades==0.1 +urllib3==1.26.5 +wadllib==1.3.6 +wasabi==1.1.2 +weasel==0.3.4 +Werkzeug==3.0.1 +xdg==5 +xkit==0.0.0 +yarl==1.9.4 +zipp==1.0.0 +zope.interface==6.1 diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index e03be09..672ba87 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ 
# Characters kept by replace_with_spaces: ASCII letters, digits, space, and
# the Latin letters with diacritics.  The diacritic run was mis-encoded in
# the source; it is restored here to the evident upper/lower pairs.
_ALLOWED_CHARS = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "ČčĆćDždžĐđŠšŽž"
    "0123456789 "
)


def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Return how many tokens ``string`` occupies under ``model``'s tokenizer."""
    return len(tiktoken.encoding_for_model(model).encode(string))


def _truncate_to_tokens(text, max_tokens, model="gpt-3.5-turbo"):
    """Return ``text`` cut to at most ``max_tokens`` tokens; always a ``str``."""
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])


def slice_text_at_2k_tokens(text):
    """Clip article text to 1950 tokens, leaving headroom under a 2k budget.

    Short input is returned unchanged as a string (it used to come back
    wrapped in a one-element list, which callers then stringified).
    """
    return _truncate_to_tokens(text, 1950)


def slice_title_if_needed(text):
    """Clip a title to 100 tokens; short titles are returned unchanged."""
    return _truncate_to_tokens(text, 100)


def replace_with_spaces(text):
    """Replace every character outside the allowed set with a single space.

    The result has the same length as ``text``.
    """
    return ''.join(char if char in _ALLOWED_CHARS else ' ' for char in text)


def fix_links(links_set):
    """Return a new set of links with every ``"www."`` occurrence removed.

    Links without ``"www."`` pass through unchanged; the input is not
    modified.
    """
    return {link.replace("www.", "") for link in links_set}
Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."} ] ) generated_text = completion.choices[0].message.content + generated_text = repair_json(generated_text) + response_data = json.loads(generated_text) - title = response_data["title"] text = response_data["content"] - - print("*********************************") - print(f"Title: {title}") - print("---------------------------------") - print(f"Content : {text}") - print("*********************************") - - vector = embeddings.embed_query(generated_text) - - if not is_similar_data(title, text, link, vector, threshold=0.9): - insert_data(title, text, link, vector) + if not is_similar_data(title, text, link, vector, threshold=0.98): + similar_d = "NO" + insert_data(title, text, link, vector,similar_d) + except Exception as e: print(f"Error in completion: {e}") continue + diff --git a/pyth/templates/index.html b/pyth/templates/index.html new file mode 100644 index 0000000..c9e51c1 --- /dev/null +++ b/pyth/templates/index.html @@ -0,0 +1,22 @@ + + + + + + Test Pyth + + +
+
+

Test Title 1

+

Test Text 1

+ First +
+
+

Test Title 2

+

Test Text 2

+ Second +
+
+ + \ No newline at end of file diff --git a/pyth/templates/one.html b/pyth/templates/one.html new file mode 100644 index 0000000..bcba718 --- /dev/null +++ b/pyth/templates/one.html @@ -0,0 +1,12 @@ + + + + + + Article + + +

Test Title

+

Test Text

+ + \ No newline at end of file diff --git a/pyth/templates/two.html b/pyth/templates/two.html new file mode 100644 index 0000000..bcba718 --- /dev/null +++ b/pyth/templates/two.html @@ -0,0 +1,12 @@ + + + + + + Article + + +

Test Title

+

Test Text

class TestIntegration(unittest.TestCase):
    """End-to-end check of link discovery and LLM clean-up against the local test site.

    Requires the Flask test server at ``dlinks`` and a valid OpenAI key.
    """

    def test_integration(self):
        found = get_article_links(dlinks, already_checked)
        self.assertEqual(len(already_checked), 2)

        # The returned links were previously discarded, so the loop below
        # never ran and the final length assertion could not pass.
        # NOTE(review): assumes get_article_links returns an iterable of
        # links, as the scraper script uses it - confirm.
        total_links.update(item for item in (found or ()) if item)
        self.assertEqual(len(total_links), 2)

        for link in total_links:
            response = requests.get(link, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            titles = soup.find_all(['h2', 'h1', 'h3'])
            title_text = ' '.join([title.get_text(strip=True) for title in titles])

            texts = soup.find_all(['p'])
            text_text = ' '.join([text.get_text(strip=True) for text in texts])

            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
                ]
            )
            generated_text = completion.choices[0].message.content

            response_data = json.loads(generated_text)
            title = response_data["title"]
            text = response_data["content"]

            # Exercised only to prove the embedding call succeeds.
            embeddings.embed_query(generated_text)

            self.assertIn("Test Title", title)
            self.assertIn("Test Text", text)
class TestIntegration(unittest.TestCase):
    """Integration tests for vectData against the database named in the DB_* variables."""

    @classmethod
    def setUpClass(cls):
        cls.host = os.getenv("DB_HOST")
        cls.port = os.getenv("DB_PORT")
        cls.user = os.getenv("DB_USER")
        cls.password = os.getenv("DB_PASSWORD")
        cls.dbname = os.getenv("DB_NAME")
        cls.conn = cls._connect()
        # create_db takes no arguments; it uses vectData's own connection.
        create_db()

    @classmethod
    def _connect(cls):
        """Open a connection with the class-level settings."""
        return psycopg2.connect(
            host=cls.host,
            port=cls.port,
            user=cls.user,
            password=cls.password,
            dbname=cls.dbname
        )

    @classmethod
    def tearDownClass(cls):
        if not cls.conn.closed:
            cls.conn.close()

    def setUp(self):
        if self.conn.closed:
            # Rebind on the class so tearDownClass closes the live connection.
            type(self).conn = self._connect()
        self.cursor = self.conn.cursor()

    def tearDown(self):
        # Only the per-test cursor is released; the connection is shared.
        if not self.cursor.closed:
            self.cursor.close()

    def test_insert_and_retrieve_data(self):
        title = 'test_title'
        text = 'test_text'
        link = 'test_link'
        embedding = np.arange(1, 1537)

        insert_data(title, text, link, embedding, "NO")

        data = get_data()

        # The table may already hold rows, so check membership.
        self.assertIn((title, text, link), data)

    def test_is_similar_data_integration(self):
        title = 'test_title'
        text = 'test_text'
        link = 'test_link'
        embedding = np.arange(1, 1537)

        insert_data(title, text, link, embedding, "NO")

        # Repeated calls must keep reporting the stored article as similar.
        for _ in range(3):
            self.assertTrue(is_similar_data(title, text, link, embedding))

    def test_create_db_integration(self):
        self.cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'")
        table_exist = bool(self.cursor.fetchone())
        self.assertTrue(table_exist)


if __name__ == '__main__':
    unittest.main()
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors as a float.

    Returns 0.0 when either vector has zero length instead of dividing by
    zero.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return float(np.dot(a, b) / norm_product)


def parse_embedding_string(embedding_str):
    """Convert a stored embedding to a NumPy array.

    Accepts the bracketed text form (``"[0.1,0.2]"``), an existing array
    (returned as-is), or a list/tuple of numbers.

    Raises:
        ValueError: For any other input type, or for non-numeric text.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str
    if isinstance(embedding_str, str):
        inner = embedding_str.strip()[1:-1].strip()
        if not inner:
            return np.array([], dtype=float)
        return np.array([float(num) for num in inner.split(',')])
    if isinstance(embedding_str, (list, tuple)):
        return np.asarray(embedding_str, dtype=float)
    raise ValueError("Invalid type for embedding_str. Must be str, np.ndarray, list or tuple.")


def is_similar_data(title, text, link, embedding, threshold=0.98):
    """Report whether an article with a near-identical embedding is stored.

    When the match comes from a different link, the new article is inserted
    here, tagged with the matching title, before returning True.

    Returns:
        True if a stored embedding exceeds ``threshold``; otherwise False,
        in which case nothing is inserted and the caller is expected to
        insert the article.
    """
    with conn.cursor() as cursor:
        cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
        rows = cursor.fetchall()

    for existing_title, raw_embedding, existing_link in rows:
        # Parse the column the same way get_titles_links_embeddings does.
        existing_embedding = parse_embedding_string(raw_embedding).flatten()
        if calculate_cosine_similarity(existing_embedding, embedding) <= threshold:
            continue
        if link != existing_link:
            insert_data(title, text, link, embedding, existing_title)
            print(f"Similar data found: \n #{title} \n #{existing_title}")
            print(f"Inserting: #{title}")
        else:
            print(f"Same article of same source!")
        return True

    print(f"Inserting: #{title}")
    return False


def get_similar():
    """Return ``(title, similar_d)`` rows whose marker is neither 'NO' nor 'SOURCE'."""
    with conn.cursor() as cursor:
        cursor.execute('''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''')
        return cursor.fetchall()


def get_titles_links_embeddings():
    """Return parallel lists of titles, links and embeddings for ready articles."""
    with conn.cursor() as cursor:
        cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
        data = cursor.fetchall()

    titles = [row[0] for row in data]
    links = [row[1] for row in data]
    embeddings = [parse_embedding_string(row[2]) for row in data]
    return titles, links, embeddings


def insert_data(title, text, link, embedding, similar_d):
    """Insert one article, stamped with the current time and marked ready."""
    c_time = datetime.now()
    with conn.cursor() as cursor:
        cursor.execute('''
            INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
            VALUES (%s, %s, %s, %s, %s ,%s ,%s);
        ''', (title, text, link, embedding, similar_d, c_time, True))
    conn.commit()


def get_data():
    """Return ``(title, text, link)`` for every stored article."""
    with conn.cursor() as cursor:
        cursor.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
        return cursor.fetchall()


def _select_by_ready(ready):
    """Return ``(title, text, link, ready)`` rows with the given ready flag."""
    with conn.cursor() as cursor:
        cursor.execute('''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''', (ready,))
        return cursor.fetchall()


def get_ready_data():
    """Return the articles flagged ready."""
    return _select_by_ready(True)


def get_source_data():
    """Return the articles flagged not ready."""
    return _select_by_ready(False)


def modify_similar_data(new_value, title):
    """Set ``similar_d`` to ``new_value`` on every row with this title."""
    with conn.cursor() as cursor:
        cursor.execute('''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''', (new_value, title))
    conn.commit()


def preparing_articles(new_value, title):
    """Set the ``ready`` flag to ``new_value`` on every row with this title."""
    with conn.cursor() as cursor:
        cursor.execute('''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s ''', (new_value, title))
    conn.commit()


def get_specific_data(title):
    """Return ``(title, text, link, similar_d, embedding, ready)`` rows for a title."""
    with conn.cursor() as cursor:
        cursor.execute('''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s''', (title,))
        return cursor.fetchall()


def get_all_links():
    """Return the set of all stored article links."""
    with conn.cursor() as cursor:
        cursor.execute('''SELECT link FROM vectorsvevijesti''')
        return {row[0] for row in cursor.fetchall()}


def delete_specific(title):
    """Delete every row with this title."""
    with conn.cursor() as cursor:
        cursor.execute('''DELETE FROM vectorsvevijesti WHERE title = %s''', (title,))
    # Without a commit the delete is never persisted.
    conn.commit()


def cleansing():
    """Delete articles whose timestamp is more than one day old."""
    day_long = datetime.now() - timedelta(days=1)
    with conn.cursor() as cursor:
        cursor.execute('''DELETE FROM vectorsvevijesti WHERE time < %s''', (day_long,))
    conn.commit()


def drop_table():
    """Drop the articles table if it exists."""
    with conn.cursor() as cursor:
        cursor.execute('''DROP TABLE IF EXISTS vectorsvevijesti;''')
    conn.commit()


def create_db():
    """Create the vector extension and the articles table if they are missing."""
    with conn.cursor() as cursor:
        cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS vectorsvevijesti (
                id bigserial PRIMARY KEY,
                title VARCHAR,
                text VARCHAR,
                link VARCHAR,
                embedding vector(1536),
                similar_d VARCHAR,
                time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                ready BOOLEAN
            );
        ''')
    conn.commit()
from flask import Flask, render_template, jsonify
from vectData import get_ready_data
from flask_cors import CORS


app = Flask(__name__)

CORS(app)


@app.route('/')
def index():
    """Serve the index page of the local test site."""
    return render_template("index.html")


@app.route('/article/one')
def articleone():
    """Serve the first test article."""
    return render_template("one.html")


@app.route('/article/two')
def articletwo():
    """Serve the second test article."""
    return render_template("two.html")


@app.route('/data/get/news', methods=['GET'])
def takenews():
    """Return every ready article as JSON."""
    data = get_ready_data()
    return jsonify(data)


if __name__ == "__main__":
    # Start the server only when run as a script, so importing this module
    # has no side effect.
    # NOTE(review): debug=True enables the interactive debugger; keep it to
    # local use.
    app.run(debug=True)