Test Title 1
+Test Text 1
+ First +diff --git a/pyth/.env b/pyth/.env new file mode 100644 index 0000000..c213e8f --- /dev/null +++ b/pyth/.env @@ -0,0 +1,7 @@ +OPENAI_API_KEY = "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" + +DB_HOST =localhost +DB_PORT =5432 +DB_USER =postgres +DB_PASSWORD =salmonela pljusti 221 hamo +DB_NAME =svevijestiweb \ No newline at end of file diff --git a/pyth/.gitlab-ci.yml b/pyth/.gitlab-ci.yml new file mode 100644 index 0000000..8cd8989 --- /dev/null +++ b/pyth/.gitlab-ci.yml @@ -0,0 +1,21 @@ +stages: + - test + +variables: + +before_script: + - pip install -r requirements.txt + +test_file1: + stage: test + script: + - python -m pytest tests/test_scrapingsingle.py + only: + - master + +test_file2: + stage: test + script: + - python -m pytest tests/test_vectData.py + only: + - master diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc new file mode 100644 index 0000000..38af9db Binary files /dev/null and b/pyth/__pycache__/scrapingsingle.cpython-310.pyc differ diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc new file mode 100644 index 0000000..4104298 Binary files /dev/null and b/pyth/__pycache__/vectData.cpython-310.pyc differ diff --git a/pyth/requirements.txt b/pyth/requirements.txt new file mode 100644 index 0000000..983c61e --- /dev/null +++ b/pyth/requirements.txt @@ -0,0 +1,141 @@ +aiohttp==3.9.1 +aiosignal==1.3.1 +annotated-types==0.6.0 +anyio==4.2.0 +apturl==0.5.2 +async-timeout==4.0.3 +attrs==23.1.0 +beautifulsoup4==4.12.2 +blinker==1.7.0 +blis==0.7.11 +Brlapi==0.8.3 +catalogue==2.0.10 +certifi==2020.6.20 +chardet==4.0.0 +charset-normalizer==3.3.2 +click==8.1.7 +cloudpathlib==0.16.0 +colorama==0.4.4 +command-not-found==0.3 +confection==0.1.4 +cryptography==3.4.8 +cupshelpers==1.0 +cymem==2.0.8 +dataclasses-json==0.6.3 +DateTime==5.4 +dbus-python==1.2.18 +decorator==4.4.2 +defer==1.0.6 +distro==1.7.0 +distro-info==1.1+ubuntu0.1 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl +exceptiongroup==1.2.0 +Flask==3.0.0 +Flask-Cors==4.0.0 +frozenlist==1.4.1 +greenlet==1.1.2 +gyp==0.1 +h11==0.14.0 +httpcore==1.0.2 +httplib2==0.20.2 +httpx==0.25.2 +idna==3.3 +importlib-metadata==4.6.4 +itsdangerous==2.1.2 +jeepney==0.7.1 +Jinja2==3.1.2 +joblib==1.3.2 +jsonpatch==1.33 +jsonpointer==2.4 +keyring==23.5.0 +langchain==0.0.352 +langchain-community==0.0.6 +langchain-core==0.1.3 +langcodes==3.3.0 +langsmith==0.0.74 +language-selector==0.1 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +louis==3.20.0 +macaroonbakery==1.3.1 +MarkupSafe==2.1.3 +marshmallow==3.20.1 +more-itertools==8.10.0 +multidict==6.0.4 +murmurhash==1.0.10 +mypy-extensions==1.0.0 +netifaces==0.11.0 +numpy==1.26.2 +oauthlib==3.2.0 +olefile==0.46 +openai==1.5.0 +packaging==23.2 +pbr==5.8.0 +pexpect==4.8.0 +pgvector==0.2.4 +Pillow==9.0.1 +preshed==3.0.9 +protobuf==3.12.4 +psycopg==3.1.15 +psycopg2-binary==2.9.9 +ptyprocess==0.7.0 +pycairo==1.20.1 +pycups==2.0.1 +pydantic==2.5.2 +pydantic_core==2.14.5 +PyGObject==3.42.1 +PyJWT==2.3.0 +pymacaroons==0.13.0 +PyNaCl==1.5.0 +pyparsing==2.4.7 +pyRFC3339==1.1 +python-apt==2.4.0+ubuntu2 +python-dateutil==2.8.1 +python-debian==0.1.43+ubuntu1.1 +python-dotenv==1.0.0 +pytz==2022.1 +pyxdg==0.27 +PyYAML==5.4.1 +regex==2023.10.3 +reportlab==3.6.8 +requests==2.31.0 +scikit-learn==1.3.2 +scipy==1.11.4 +SecretStorage==3.3.1 +six==1.16.0 +slugify==0.0.1 +smart-open==6.4.0 +sniffio==1.3.0 +soupsieve==2.5 +spacy==3.7.2 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +SQLAlchemy==1.4.31 +sqlalchemy-migrate==0.13.0 +sqlparse==0.4.2 +srsly==2.4.8 +systemd-python==234 +Tempita==0.5.2 +tenacity==8.2.3 +thinc==8.2.2 +threadpoolctl==3.2.0 +tiktoken==0.5.2 +tqdm==4.66.1 +typer==0.9.0 +typing-inspect==0.9.0 +typing_extensions==4.9.0 +ubuntu-advantage-tools==8001 +ubuntu-drivers-common==0.0.0 +ufw==0.36.1 +unattended-upgrades==0.1 +urllib3==1.26.5 +wadllib==1.3.6 +wasabi==1.1.2 +weasel==0.3.4 +Werkzeug==3.0.1 +xdg==5 +xkit==0.0.0 +yarl==1.9.4 +zipp==1.0.0 +zope.interface==6.1 diff --git a/pyth/scrapingsingle.py b/pyth/scrapingsingle.py index e03be09..8e65beb 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/scrapingsingle.py @@ -1,15 +1,20 @@ from bs4 import BeautifulSoup import requests from urllib.parse import urljoin -from openai import OpenAI +from openai import OpenAI , APIError import os from langchain.embeddings import OpenAIEmbeddings -from langchain.vectorstores.pgvector import PGVector -from vectData import insert_data ,is_similar_data +from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data) import json +from dotenv import load_dotenv +import tiktoken -os.environ["OPENAI_API_KEY"] = "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" +load_dotenv() +cleansing() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") + client = OpenAI() embeddings = OpenAIEmbeddings() @@ -17,9 +22,36 @@ dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info'] headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'} + +def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int: + encoding = tiktoken.encoding_for_model(model) + return len(encoding.encode(string)) + +def slice_text_at_2k_tokens(text): + encoding_name = "gpt-3.5-turbo" + max_tokens = 2000 + + encoding = tiktoken.encoding_for_model(encoding_name) + tokens = encoding.encode(text) + + if len(tokens) <= max_tokens: + return [text] + + sliced_tokens = tokens[:max_tokens] + sliced_text = encoding.decode(sliced_tokens) + + return sliced_text + + +def replace_with_spaces(text): + allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 " + cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text) + return cleaned_text + total_links = set() collected_news = set() + def get_article_links(url, already_checked): response = requests.get(url,headers) if response.status_code == 200: @@ -36,6 +68,8 @@ def get_article_links(url, already_checked): already_checked.add(link_value) return link_store + + already_checked = set() for dlink in dlinks: @@ -44,8 +78,17 @@ for dlink in dlinks: total_links.update(temp_links) final_links = {item for item in total_links if item} +i = 0 -for link in final_links: +db_links = set(get_all_links()) +new_links = final_links - db_links +final_links = new_links + + + +if __name__ == '__main__': + + for link in final_links: response = requests.get(link,headers) soup = BeautifulSoup(response.text, 'html.parser') @@ -54,6 +97,16 @@ for link in final_links: texts = soup.find_all(['p']) text_text = ' '.join([text.get_text(strip=True) for text in texts]) + + text_text = text_text + title_text = title_text + + title_text = replace_with_spaces(title_text) + + + print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}") + text_text = slice_text_at_2k_tokens(text_text) + text_text = replace_with_spaces(str(text_text)) try: completion = client.chat.completions.create( @@ -65,23 +118,130 @@ for link in final_links: ) generated_text = completion.choices[0].message.content + generated_text = generated_text + response_data = json.loads(generated_text) title = response_data["title"] text = response_data["content"] - print("*********************************") - print(f"Title: {title}") - print("---------------------------------") - print(f"Content : {text}") - print("*********************************") + #print("*********************************") + #print(f"Title: {title}") + #print("---------------------------------") + #print(f"Content : {text}") + #print("*********************************") vector = embeddings.embed_query(generated_text) - - if not is_similar_data(title, text, link, vector, threshold=0.9): - insert_data(title, text, link, vector) + if not is_similar_data(title, text, link, vector, threshold=0.98): + similar_d = "NO" + insert_data(title, text, link, vector,similar_d) + except Exception as e: print(f"Error in completion: {e}") continue + +def comb_similar(): + + print("Checking similar") + similar_article = get_similar() + + grouped_data = {} + + + for sa in similar_article: + if similar_article: + first_t = get_specific_data(sa[0]) + second_t = get_specific_data(sa[1]) + link_f = first_t[0][2] + link_s = second_t[0][2] + f_text = first_t[0][1] + s_text = second_t[0][1] + f_title = first_t[0][0] + s_title = second_t[0][0] + + if f_title in grouped_data: + grouped_data[f_title].append((f_text, link_f)) + else: + grouped_data[f_title] = [(f_text, link_f)] + + if s_title in grouped_data: + grouped_data[s_title].append((s_text, link_s)) + else: + grouped_data[s_title] = [(s_text, link_s)] + + for title, tuples in grouped_data.items(): + if len(tuples) == 3: + text1, link1 = tuples[0] + text2, link2 = tuples[1] + text3, link3 = tuples[2] + + t1check = num_tokens_from_string(text1) + t2check = num_tokens_from_string(text2) + t3check = num_tokens_from_string(text3) + slice_if_more = t1check,t2check,t3check + if slice_if_more < 2000: + combined_text = f"{text1}{text2}{text3}" + combined_text = slice_text_at_2k_tokens(combined_text) + user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field" + link = f"{link1} {link2} {link3}" + + else: + user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." + link = f"{link1} {link2} {link3}" + + else: + ftcheck = num_tokens_from_string(f_text) + stcheck = num_tokens_from_string(s_text) + fscomb = ftcheck + stcheck + if fscomb <2000: + combined_text = f"{f_text}{s_text}" + user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field" + link = f"{link_f} {link_s}" + + else: + user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field." + link = f"{link_f} {link_s}" + + try: + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": user_message} + ] + ) + generated_text = completion.choices[0].message.content + generated_text = generated_text + + if similar_article: + if f_title == s_title: + print(f_title) + modify_similar_data(first_t,"SOURCE") + similar_article.remove(sa) + print("Modified") + else: + print(f"Second: {s_title}") + modify_similar_data(first_t,"SOURCE") + modify_similar_data(second_t,"SOURCE") + similar_article.remove(sa) + print("Modified") + else: + print("Similar list is empty") + + response_data = json.loads(generated_text) + title = f_title + text = response_data["content"] + + vector = embeddings.embed_query(generated_text) + + if not is_similar_data(title, text, link, vector, threshold=0.98): + similar_d = "NO" + insert_data(title, text, link, vector, similar_d) + + except Exception as e: + print(f"Error in completion: {e}") + continue + +comb_similar() \ No newline at end of file diff --git a/pyth/templates/index.html b/pyth/templates/index.html new file mode 100644 index 0000000..9b156d8 --- /dev/null +++ b/pyth/templates/index.html @@ -0,0 +1,23 @@ + + +
+ + +Test Text
+ + \ No newline at end of file diff --git a/pyth/templates/two.html b/pyth/templates/two.html new file mode 100644 index 0000000..bcba718 --- /dev/null +++ b/pyth/templates/two.html @@ -0,0 +1,12 @@ + + + + + +Test Text
+ + \ No newline at end of file diff --git a/pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc b/pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc new file mode 100644 index 0000000..ab3b6ce Binary files /dev/null and b/pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc differ diff --git a/pyth/tests/__pycache__/test_vectData.cpython-310.pyc b/pyth/tests/__pycache__/test_vectData.cpython-310.pyc new file mode 100644 index 0000000..eb021b5 Binary files /dev/null and b/pyth/tests/__pycache__/test_vectData.cpython-310.pyc differ diff --git a/pyth/tests/test_scrapingsingle.py b/pyth/tests/test_scrapingsingle.py new file mode 100644 index 0000000..5afcfda --- /dev/null +++ b/pyth/tests/test_scrapingsingle.py @@ -0,0 +1,60 @@ +import unittest +from unittest.mock import patch +import requests +from bs4 import BeautifulSoup +from langchain.embeddings import OpenAIEmbeddings +from langchain.vectorstores.pgvector import PGVector +from openai import OpenAI +import json +from dotenv import load_dotenv +from scrapingsingle import get_article_links, insert_data, is_similar_data +import os + +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +client = OpenAI() +embeddings = OpenAIEmbeddings() + + +already_checked = set() +total_links = set() +collected_news = set() +dlinks = 'http://127.0.0.1:5000/' + +class TestIntegration(unittest.TestCase): + + + def test_integration(self): + link = get_article_links(dlinks,already_checked) + self.assertEqual(len(already_checked), 2) + + for link in total_links: + response = requests.get(link) + soup = BeautifulSoup(response.text, 'html.parser') + + titles = soup.find_all(['h2', 'h1', 'h3']) + title_text = ' '.join([title.get_text(strip=True) for title in titles]) + + texts = soup.find_all(['p']) + text_text = ' '.join([text.get_text(strip=True) for text in texts]) + + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."} + ] + ) + generated_text = completion.choices[0].message.content + + response_data = json.loads(generated_text) + title = response_data["title"] + text = response_data["content"] + + vector = embeddings.embed_query(generated_text) + + self.assertIn("Test Title", title) + self.assertIn("Test Text", text) + self.assertEqual(len(total_links), 2) + diff --git a/pyth/tests/test_vectData.py b/pyth/tests/test_vectData.py new file mode 100644 index 0000000..99d4dd6 --- /dev/null +++ b/pyth/tests/test_vectData.py @@ -0,0 +1,89 @@ +import unittest +import numpy as np +import psycopg2 +import os +from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db + +class TestIntegration(unittest.TestCase): + host = os.getenv("DB_HOST") + port = os.getenv("DB_PORT") + user = os.getenv("DB_USER") + password = os.getenv("DB_PASSWORD") + dbname = os.getenv("DB_NAME") + + @classmethod + def setUpClass(cls): + cls.host = os.getenv("DB_HOST") + cls.port = os.getenv("DB_PORT") + cls.user = os.getenv("DB_USER") + cls.password = os.getenv("DB_PASSWORD") + cls.dbname = os.getenv("DB_NAME") + + cls.conn = psycopg2.connect( + host=cls.host, + port=cls.port, + user=cls.user, + password=cls.password, + dbname=cls.dbname + ) + create_db(cls.conn) + + @classmethod + def tearDownClass(cls): + cls.conn.close() + + def setUp(self): + if self.conn.closed: + self.conn = psycopg2.connect( + host=self.host, + port=self.port, + user=self.user, + password=self.password, + dbname=self.dbname + ) + self.cursor = self.conn.cursor() + + def tearDown(self): + if not self.cursor.closed: + self.cursor.close() + + if not self.conn.closed: + self.conn.close() + + def test_insert_and_retrieve_data(self): + title = 'test_title' + text = 'test_text' + link = 'test_link' + embedding = np.arange(1, 1537) + + insert_data(title, text, link, embedding) + + data = get_data() + + self.assertEqual(data, [(title, text, link)]) + + def test_is_similar_data_integration(self): + title = 'test_title' + text = 'test_text' + link = 'test_link' + embedding = np.arange(1, 1537) + + insert_data(title, text, link, embedding) + + result = is_similar_data(title, text, link, embedding) + self.assertTrue(result) + + result = is_similar_data(title, text, link, embedding) + self.assertTrue(result) + + result = is_similar_data(title, text, link, embedding) + self.assertTrue(result) + + def test_create_db_integration(self): + cursor = self.conn.cursor() + cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'") + table_exist = bool(cursor.fetchone()) + self.assertTrue(table_exist) + +if __name__ == '__main__': + unittest.main() diff --git a/pyth/vectData.py b/pyth/vectData.py index dd1e2d7..e99883a 100644 --- a/pyth/vectData.py +++ b/pyth/vectData.py @@ -3,12 +3,26 @@ from psycopg2 import sql from pgvector.psycopg2 import register_vector from sklearn.metrics.pairwise import cosine_similarity import numpy as np +import os +from dotenv import load_dotenv +from datetime import datetime ,timedelta -host = 'localhost' -port = '5432' -user = 'postgres' -password = 'salmonela pljusti 221 hamo' -dbname = 'vector_svw' + +load_dotenv() + +host = os.getenv("DB_HOST") +port = os.getenv("DB_PORT") +user = os.getenv("DB_USER") +password = os.getenv("DB_PASSWORD") +dbname = os.getenv("DB_NAME") + +conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) def calculate_cosine_similarity(v1, v2): v1_normalized = v1 / np.linalg.norm(v1) @@ -17,7 +31,7 @@ def calculate_cosine_similarity(v1, v2): similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0] return similarity -def is_similar_data(title, text, link, embedding, threshold=0.9): +def is_similar_data(title, text, link, embedding, threshold=0.98): conn = psycopg2.connect( host=host, port=port, @@ -27,25 +41,33 @@ def is_similar_data(title, text, link, embedding, threshold=0.9): ) cursor = conn.cursor() - cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;') + cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;') existing_embeddings = cursor.fetchall() for existing_embedding_tuple in existing_embeddings: existing_title = existing_embedding_tuple[0] existing_embedding = np.array(existing_embedding_tuple[1]).flatten() + existing_link = existing_embedding_tuple[2] similarity = calculate_cosine_similarity(existing_embedding, embedding) if similarity > threshold: - print(f"Similar data found: \n #{title} \n #{existing_title}") - cursor.close() - conn.close() - return True + if link != existing_link: + similar_d = existing_title + insert_data(title,text,link,embedding,similar_d) + print(f"Similar data found: \n #{title} \n #{existing_title}") + print(f"Inserting: #{title} \n") + similar_d = "NO" + cursor.close() + return True + else: + print(f"Same source of same article!") + cursor.close() + return True print(f"Inserting: #{title}") cursor.close() - conn.close() return False -def insert_data(title, text, link, embedding): +def get_similar(): conn = psycopg2.connect( host=host, port=port, @@ -53,17 +75,35 @@ def insert_data(title, text, link, embedding): password=password, dbname=dbname ) + cursor = conn.cursor() + query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''' + cursor.execute(query) + similar_data = cursor.fetchall() + cursor.close() + return similar_data + + +def insert_data(title, text, link, embedding, similar_d): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + c_time = datetime.now() + + cursor = conn.cursor() cursor.execute(''' - INSERT INTO vectorsvevijesti (title, text, link, embedding) - VALUES (%s, %s, %s, %s); - ''', (title, text, link, embedding)) + INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time) + VALUES (%s, %s, %s, %s, %s ,%s); + ''', (title, text, link, embedding , similar_d, c_time)) conn.commit() cursor.close() - conn.close() def get_data(): conn = psycopg2.connect( @@ -79,11 +119,110 @@ def get_data(): cursor.execute(query) data = cursor.fetchall() cursor.close() - conn.close() - return data -def create_db(): +def modify_similar_data(new_value ,title): + + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + + query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''' + + cursor.execute(query, (new_value, title)) + + conn.commit() + +def get_specific_data(title): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s''' + cursor.execute(query, (title,)) + + specific_post = cursor.fetchall() + cursor.close() + return specific_post + +def get_all_links(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + cursor = conn.cursor() + query = '''SELECT link FROM vectorsvevijesti''' + cursor.execute(query) + + db_links = {link[0] for link in cursor.fetchall()} + cursor.close() + return db_links + +def delete_specific(title): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + cursor = conn.cursor() + query = '''DELETE FROM vectorsvevijesti WHERE title = %s''' + + cursor.execute(query,(title,)) + cursor.close() + +def cleansing(): + + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + day_long = datetime.now() - timedelta(days=1) + + cursor = conn.cursor() + + query = '''DELETE FROM vectorsvevijesti WHERE time < %s''' + cursor.execute(query,(day_long,)) + + conn.commit() + cursor.close() + +def drop_table(): + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + + cursor = conn.cursor() + + query = '''DROP TABLE IF EXISTS vectorsvevijesti;''' + cursor.execute(query) + + conn.commit() + cursor.close() + +def create_db(conn): conn = psycopg2.connect( host=host, port=port, @@ -97,19 +236,18 @@ def create_db(): register_vector(conn) - cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;") - cursor.execute(''' - CREATE TABLE vectorsvevijesti ( + CREATE TABLE IF NOT EXISTS vectorsvevijesti ( id bigserial PRIMARY KEY, title VARCHAR, text VARCHAR, link VARCHAR, - embedding vector(1536) + embedding vector(1536), + similar_d VARCHAR, + time TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); ''') conn.commit() cursor.close() - conn.close() -create_db() \ No newline at end of file +create_db(conn) diff --git a/pyth/web-server.py b/pyth/web-server.py new file mode 100644 index 0000000..ae78c2b --- /dev/null +++ b/pyth/web-server.py @@ -0,0 +1,24 @@ +from flask import Flask , render_template , jsonify +from vectData import get_data +from flask_cors import CORS + + +app = Flask(__name__) + +CORS(app) + +@app.route('/') +def index() : + return render_template("index.html") + + +@app.route('/article/one') +def articleone(): + return render_template("one.html") + + +@app.route('/article/two') +def articletwo(): + return render_template("two.html") + +app.run(debug=True) \ No newline at end of file