Merge branch 'svevijesti-cs' into 'master'

Combine similar articles

See merge request kbr4/svevijesti!5
This commit was merged in pull request #5.
This commit is contained in:
2024-01-08 09:45:03 +00:00
17 changed files with 856 additions and 83 deletions

7
pyth/.env Normal file
View File

@@ -0,0 +1,7 @@
OPENAI_API_KEY = "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"
DB_HOST =localhost
DB_PORT =5432
DB_USER =postgres
DB_PASSWORD =salmonela pljusti 221 hamo
DB_NAME =svevijestiweb

21
pyth/.gitlab-ci.yml Normal file
View File

@@ -0,0 +1,21 @@
stages:
- test
variables:
before_script:
- pip install -r requirements.txt
test_file1:
stage: test
script:
- python -m pytest tests/test_scrapingsingle.py
only:
- master
test_file2:
stage: test
script:
- python -m pytest tests/test_vectData.py
only:
- master

Binary file not shown.

Binary file not shown.

Binary file not shown.

241
pyth/articles.py Normal file
View File

@@ -0,0 +1,241 @@
import psycopg2
import numpy as np
from dotenv import load_dotenv
import os
from openai import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
import json
from json_repair import repair_json
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()
print(f"Checking for similar!")
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Group stored articles whose embeddings are close to a seed article.

    Articles are visited in the order returned by
    ``get_titles_links_embeddings()``. Every article that has not been
    assigned yet becomes the seed of a new group, and each later unassigned
    article whose cosine similarity to the seed exceeds ``threshold`` joins
    that group. Similarity is measured against the seed only, never against
    the other members of the group.

    Args:
        eps: Not used by the implementation; kept so existing calls still work.
        min_samples: Not used by the implementation; kept for the same reason.
        threshold: Similarity a candidate must exceed to join a seed's group.

    Returns:
        A list of groups. Each group is a list of ``(title, link)`` tuples
        that starts with its seed; an article without a match forms a
        one-element group. An empty list is returned when ``psycopg2.Error``
        is raised while the work is in progress.
    """
    try:
        # Named `vectors` so the module-level `embeddings` client is not shadowed.
        titles, links, vectors = get_titles_links_embeddings()
        records = list(zip(titles, links, vectors))
        processed_articles = set()
        grouped_similar_articles = []
        for position, (seed_title, seed_link, seed_vector) in enumerate(records):
            seed_key = (seed_title, seed_link)
            if seed_key in processed_articles:
                continue
            processed_articles.add(seed_key)
            group = [seed_key]
            # Every record before `position` was already a seed or a member,
            # so only the records after the seed can still be unassigned.
            for title, link, vector in records[position + 1:]:
                candidate_key = (title, link)
                if candidate_key in processed_articles:
                    continue
                if calculate_cosine_similarity(seed_vector, vector) > threshold:
                    processed_articles.add(candidate_key)
                    group.append(candidate_key)
            grouped_similar_articles.append(group)
        return grouped_similar_articles
    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []
def _join_unique_links(links):
    """Join links with ", ", keeping only the first occurrence of each link."""
    return ", ".join(dict.fromkeys(links))


def _build_combine_prompt(texts):
    """Build the user prompt asking the model to merge *texts* into one story."""
    count = len(texts)
    total_tokens = sum(num_tokens_from_string(text) for text in texts)
    if total_tokens > 2000:
        # Too long to send separately: concatenate and truncate before prompting.
        combined_text = slice_text_at_2k_tokens(" ".join(texts))
        return rf"Here is text {combined_text}, combined from {count} sources, filter text, and make news content, return as JSON only with a single 'content' field"
    listed = " ".join(texts[:-1]) + " and " + texts[-1]
    return rf"Here are {count} texts {listed}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field."


def _combine_group(titles):
    """Merge the stored articles named by *titles* into one combined article.

    Raises:
        LookupError: If one of the titles has no stored row.
    """
    rows = []
    for title in titles:
        found = get_specific_data(title)
        if not found:
            raise LookupError(f"No stored article titled {title!r}")
        # Rows are (title, text, link, ...); only the first match is used.
        rows.append(found[0])

    texts = [row[1] for row in rows]
    link = _join_unique_links(row[2] for row in rows)
    similar_d = "C: " + ", ".join(titles)

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Data analytic, Journalist and News reporter"},
            {"role": "user", "content": _build_combine_prompt(texts)},
        ],
    )
    generated_text = repair_json(completion.choices[0].message.content)
    text = json.loads(generated_text)["content"]
    vector = embeddings.embed_query(generated_text)

    # Flag the sources only once the combined content exists, so a failed
    # model call does not leave them hidden without a replacement. This must
    # happen before the insert: both updates match rows by title, and the
    # combined article reuses the first source's title.
    for title in titles:
        modify_similar_data(similar_d, title)
        preparing_articles(False, title)

    insert_data(titles[0], text, link, vector, similar_d)
    print(f"Inserting combined: {titles[0]}")


def processing_similar():
    """Replace every group of similar articles with one combined article.

    Groups come from ``find_and_group_similar_articles()``. For each group
    with at least two articles, of any size, the stored texts are sent to
    the chat model, the source rows are tagged with a ``"C: <titles>"``
    marker and set to not ready, and the merged text is inserted under the
    first article's title with the de-duplicated source links.

    A failure in one group is printed together with the group's first title
    and the remaining groups are still processed. ``"Done!."`` is printed
    after all groups have been visited, and ``"No similar articles found."``
    when there are no groups at all.
    """
    grouped_similar_articles_result = find_and_group_similar_articles()
    if not grouped_similar_articles_result:
        print("No similar articles found.")
        return

    for group in grouped_similar_articles_result:
        titles = [entry[0] for entry in group if len(entry) >= 2]
        if len(titles) < 2:
            # One-element groups have nothing to be combined with.
            continue
        try:
            _combine_group(titles)
        except Exception as e:
            print(f"Error: {e}")
            print(titles[0])
    print("Done!.")
if __name__=="__main__":
processing_similar()

141
pyth/requirements.txt Normal file
View File

@@ -0,0 +1,141 @@
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.2.0
apturl==0.5.2
async-timeout==4.0.3
attrs==23.1.0
beautifulsoup4==4.12.2
blinker==1.7.0
blis==0.7.11
Brlapi==0.8.3
catalogue==2.0.10
certifi==2020.6.20
chardet==4.0.0
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.16.0
colorama==0.4.4
command-not-found==0.3
confection==0.1.4
cryptography==3.4.8
cupshelpers==1.0
cymem==2.0.8
dataclasses-json==0.6.3
DateTime==5.4
dbus-python==1.2.18
decorator==4.4.2
defer==1.0.6
distro==1.7.0
distro-info==1.1+ubuntu0.1
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
exceptiongroup==1.2.0
Flask==3.0.0
Flask-Cors==4.0.0
frozenlist==1.4.1
greenlet==1.1.2
gyp==0.1
h11==0.14.0
httpcore==1.0.2
httplib2==0.20.2
httpx==0.25.2
idna==3.3
importlib-metadata==4.6.4
itsdangerous==2.1.2
jeepney==0.7.1
Jinja2==3.1.2
joblib==1.3.2
jsonpatch==1.33
jsonpointer==2.4
keyring==23.5.0
langchain==0.0.352
langchain-community==0.0.6
langchain-core==0.1.3
langcodes==3.3.0
langsmith==0.0.74
language-selector==0.1
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
louis==3.20.0
macaroonbakery==1.3.1
MarkupSafe==2.1.3
marshmallow==3.20.1
more-itertools==8.10.0
multidict==6.0.4
murmurhash==1.0.10
mypy-extensions==1.0.0
netifaces==0.11.0
numpy==1.26.2
oauthlib==3.2.0
olefile==0.46
openai==1.5.0
packaging==23.2
pbr==5.8.0
pexpect==4.8.0
pgvector==0.2.4
Pillow==9.0.1
preshed==3.0.9
protobuf==3.12.4
psycopg==3.1.15
psycopg2-binary==2.9.9
ptyprocess==0.7.0
pycairo==1.20.1
pycups==2.0.1
pydantic==2.5.2
pydantic_core==2.14.5
PyGObject==3.42.1
PyJWT==2.3.0
pymacaroons==0.13.0
PyNaCl==1.5.0
pyparsing==2.4.7
pyRFC3339==1.1
python-apt==2.4.0+ubuntu2
python-dateutil==2.8.1
python-debian==0.1.43+ubuntu1.1
python-dotenv==1.0.0
pytz==2022.1
pyxdg==0.27
PyYAML==5.4.1
regex==2023.10.3
reportlab==3.6.8
requests==2.31.0
scikit-learn==1.3.2
scipy==1.11.4
SecretStorage==3.3.1
six==1.16.0
slugify==0.0.1
smart-open==6.4.0
sniffio==1.3.0
soupsieve==2.5
spacy==3.7.2
spacy-legacy==3.0.12
spacy-loggers==1.0.5
SQLAlchemy==1.4.31
sqlalchemy-migrate==0.13.0
sqlparse==0.4.2
srsly==2.4.8
systemd-python==234
Tempita==0.5.2
tenacity==8.2.3
thinc==8.2.2
threadpoolctl==3.2.0
tiktoken==0.5.2
tqdm==4.66.1
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.9.0
ubuntu-advantage-tools==8001
ubuntu-drivers-common==0.0.0
ufw==0.36.1
unattended-upgrades==0.1
urllib3==1.26.5
wadllib==1.3.6
wasabi==1.1.2
weasel==0.3.4
Werkzeug==3.0.1
xdg==5
xkit==0.0.0
yarl==1.9.4
zipp==1.0.0
zope.interface==6.1

View File

@@ -4,18 +4,63 @@ from urllib.parse import urljoin
from openai import OpenAI
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from vectData import insert_data ,is_similar_data
from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
import json
from dotenv import load_dotenv
import tiktoken
from json_repair import repair_json
load_dotenv()
cleansing()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = "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"
client = OpenAI()
embeddings = OpenAIEmbeddings()
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Return how many tokens *string* encodes to with the tiktoken encoding for *model*."""
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)
def slice_text_at_2k_tokens(text, max_tokens=1950, model="gpt-3.5-turbo"):
    """Return *text* cut down to at most ``max_tokens`` tokens.

    Args:
        text: The string to truncate.
        max_tokens: Largest number of tokens to keep.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        ``text`` itself when it already fits, otherwise the decoded first
        ``max_tokens`` tokens. The result is always a ``str``.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        # Return the string, not a one-element list: callers interpolate the
        # result straight into prompts and expect the same type either way.
        return text
    return encoding.decode(tokens[:max_tokens])
def slice_title_if_needed(text, max_tokens=100, model="gpt-3.5-turbo"):
    """Return the title *text* cut down to at most ``max_tokens`` tokens.

    Args:
        text: The title to truncate.
        max_tokens: Largest number of tokens to keep.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        ``text`` itself when it already fits, otherwise the decoded first
        ``max_tokens`` tokens. The result is always a ``str``.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        # A list here would be rendered as "['...']" inside the prompt.
        return text
    return encoding.decode(tokens[:max_tokens])
# ASCII letters and digits, the space, and the extra letters of the
# Bosnian/Croatian/Serbian Latin alphabet (the digraph "Dž" needs no entry of
# its own because "D", "d" and "ž" are already listed).
_ALLOWED_TEXT_CHARS = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "ČčĆćĐđŠšŽž"
    "0123456789 "
)


def replace_with_spaces(text):
    """Return *text* with every character outside the allowed set replaced by a space.

    Allowed characters are ASCII letters, digits, the space and
    ``Č č Ć ć Đ đ Š š Ž ž``. The result has the same length as the input.
    """
    return ''.join(char if char in _ALLOWED_TEXT_CHARS else ' ' for char in text)
def fix_links(links_set):
    """Return a new set holding each link with every "www." occurrence removed.

    Links that differ only by "www." collapse into a single entry. The input
    collection is left unmodified.
    """
    # str.replace is a no-op for links without "www.", so no pre-check is needed.
    return {url.replace("www.", "") for url in links_set}
total_links = set()
collected_news = set()
@@ -42,10 +87,17 @@ for dlink in dlinks:
temp_links = get_article_links(dlink, already_checked)
if temp_links:
total_links.update(temp_links)
final_links = {item for item in total_links if item}
for link in final_links:
db_links = set(get_all_links())
new_links = final_links - db_links
final_links = new_links
final_links = set(final_links)
final_links = fix_links(final_links)
if __name__ == '__main__':
for link in final_links:
response = requests.get(link,headers)
soup = BeautifulSoup(response.text, 'html.parser')
@@ -55,33 +107,40 @@ for link in final_links:
texts = soup.find_all(['p'])
text_text = ' '.join([text.get_text(strip=True) for text in texts])
text_text = text_text
title_text = title_text
title_text = replace_with_spaces(title_text)
text_text = slice_text_at_2k_tokens(text_text)
text_text = replace_with_spaces(str(text_text))
ttk = num_tokens_from_string(text_text)
if ttk > 1900:
title_text = slice_title_if_needed(title_text)
try:
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "Data analytic, Journalist and News reporter"},
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."}
]
)
generated_text = completion.choices[0].message.content
response_data = json.loads(generated_text)
generated_text = repair_json(generated_text)
response_data = json.loads(generated_text)
title = response_data["title"]
text = response_data["content"]
print("*********************************")
print(f"Title: {title}")
print("---------------------------------")
print(f"Content : {text}")
print("*********************************")
vector = embeddings.embed_query(generated_text)
if not is_similar_data(title, text, link, vector, threshold=0.9):
insert_data(title, text, link, vector)
if not is_similar_data(title, text, link, vector, threshold=0.98):
similar_d = "NO"
insert_data(title, text, link, vector,similar_d)
except Exception as e:
print(f"Error in completion: {e}")
continue

22
pyth/templates/index.html Normal file
View File

@@ -0,0 +1,22 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Test Pyth</title>
</head>
<body>
<div>
<article>
<h2>Test Title 1</h2>
<p>Test Text 1</p>
<a href="/article/one"> First</a>
</article>
<article>
<h2>Test Title 2</h2>
<p>Test Text 2</p>
<a href="/article/two">Second</a>
</article>
</div>
</body>
</html>

12
pyth/templates/one.html Normal file
View File

@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Article</title>
</head>
<body>
<h2>Test Title</h2>
<p>Test Text</p>
</body>
</html>

12
pyth/templates/two.html Normal file
View File

@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Article</title>
</head>
<body>
<h2>Test Title</h2>
<p>Test Text</p>
</body>
</html>

Binary file not shown.

View File

@@ -0,0 +1,60 @@
import unittest
from unittest.mock import patch
import requests
from bs4 import BeautifulSoup
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from openai import OpenAI
import json
from dotenv import load_dotenv
from scrapingsingle import get_article_links, insert_data, is_similar_data
import os
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()
already_checked = set()
total_links = set()
collected_news = set()
dlinks = 'http://127.0.0.1:5000/'
class TestIntegration(unittest.TestCase):
    """End-to-end check of link discovery and article extraction.

    Needs the fixture site to be reachable at ``dlinks`` and working OpenAI
    credentials in the environment.
    """

    def test_integration(self):
        """Discover the fixture articles and extract title and text from each."""
        discovered = get_article_links(dlinks, already_checked)
        self.assertEqual(len(already_checked), 2)

        # Keep the returned links: without this the loop below never runs and
        # the count assertion can never pass.
        total_links.update(item for item in (discovered or ()) if item)
        self.assertEqual(len(total_links), 2)

        for link in total_links:
            with self.subTest(link=link):
                response = requests.get(link)
                soup = BeautifulSoup(response.text, 'html.parser')
                titles = soup.find_all(['h2', 'h1', 'h3'])
                title_text = ' '.join([title.get_text(strip=True) for title in titles])
                texts = soup.find_all(['p'])
                text_text = ' '.join([text.get_text(strip=True) for text in texts])
                completion = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                        {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
                    ]
                )
                generated_text = completion.choices[0].message.content
                response_data = json.loads(generated_text)
                title = response_data["title"]
                text = response_data["content"]
                vector = embeddings.embed_query(generated_text)
                self.assertTrue(vector)
                self.assertIn("Test Title", title)
                self.assertIn("Test Text", text)

View File

@@ -0,0 +1,89 @@
import unittest
import numpy as np
import psycopg2
import os
from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
class TestIntegration(unittest.TestCase):
    """Integration tests for the vectData helpers against a live PostgreSQL database.

    Connection settings are read from the ``DB_HOST``, ``DB_PORT``,
    ``DB_USER``, ``DB_PASSWORD`` and ``DB_NAME`` environment variables.
    """

    @classmethod
    def _connect(cls):
        """Open a new connection using the class-level settings."""
        return psycopg2.connect(
            host=cls.host,
            port=cls.port,
            user=cls.user,
            password=cls.password,
            dbname=cls.dbname
        )

    @classmethod
    def setUpClass(cls):
        cls.host = os.getenv("DB_HOST")
        cls.port = os.getenv("DB_PORT")
        cls.user = os.getenv("DB_USER")
        cls.password = os.getenv("DB_PASSWORD")
        cls.dbname = os.getenv("DB_NAME")
        cls.conn = cls._connect()
        # create_db() takes no arguments; passing a connection raises TypeError.
        create_db()

    @classmethod
    def tearDownClass(cls):
        cls.conn.close()

    def setUp(self):
        # tearDown closes the connection after every test, so reopen it here.
        if self.conn.closed:
            self.conn = self._connect()
        self.cursor = self.conn.cursor()

    def tearDown(self):
        if not self.cursor.closed:
            self.cursor.close()
        if not self.conn.closed:
            self.conn.close()

    def test_insert_and_retrieve_data(self):
        """A row written by insert_data is returned by get_data."""
        title = 'test_title'
        text = 'test_text'
        link = 'test_link'
        embedding = np.arange(1, 1537)
        insert_data(title, text, link, embedding, "NO")
        data = get_data()
        # The table may already hold rows from earlier runs or other tests,
        # so check membership instead of comparing the whole result.
        self.assertIn((title, text, link), data)

    def test_is_similar_data_integration(self):
        """An article identical to a stored one is reported as similar."""
        title = 'test_title'
        text = 'test_text'
        link = 'test_link'
        embedding = np.arange(1, 1537)
        insert_data(title, text, link, embedding, "NO")
        result = is_similar_data(title, text, link, embedding)
        self.assertTrue(result)

    def test_create_db_integration(self):
        """The vectorsvevijesti table exists after create_db has run."""
        self.cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'")
        table_exist = bool(self.cursor.fetchone())
        self.assertTrue(table_exist)
if __name__ == '__main__':
unittest.main()

View File

@@ -3,113 +3,193 @@ from psycopg2 import sql
from pgvector.psycopg2 import register_vector
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from dotenv import load_dotenv
from datetime import datetime ,timedelta
host = 'localhost'
port = '5432'
user = 'postgres'
password = 'salmonela pljusti 221 hamo'
dbname = 'vector_svw'
load_dotenv()
def calculate_cosine_similarity(v1, v2):
v1_normalized = v1 / np.linalg.norm(v1)
v2_normalized = v2 / np.linalg.norm(v2)
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME")
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
return similarity
def is_similar_data(title, text, link, embedding, threshold=0.9):
conn = psycopg2.connect(
conn = psycopg2.connect(
host=host,
port=port,
user=user,
password=password,
dbname=dbname
)
cursor = conn.cursor()
cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of *v1* and *v2* as a single number.

    Each vector is divided by its Euclidean norm before being handed to
    ``cosine_similarity`` as a one-row matrix.
    """
    unit_vectors = [vector / np.linalg.norm(vector) for vector in (v1, v2)]
    score_matrix = cosine_similarity([unit_vectors[0]], [unit_vectors[1]])
    return score_matrix[0][0]
def parse_embedding_string(embedding_str):
    """Convert a stored embedding into a NumPy array.

    Args:
        embedding_str: Either a string such as ``"[0.1,0.2]"`` (the first and
            last characters are dropped and the rest is split on commas) or
            an ``np.ndarray``, which is returned unchanged.

    Returns:
        A NumPy array of floats, or the input array itself.

    Raises:
        ValueError: If the argument is neither ``str`` nor ``np.ndarray``, or
            a piece of the string is not a valid float.
    """
    if isinstance(embedding_str, str):
        pieces = embedding_str[1:-1].split(',')
        return np.array([float(piece) for piece in pieces])
    if isinstance(embedding_str, np.ndarray):
        return embedding_str
    raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")
def is_similar_data(title, text, link, embedding, threshold=0.98):
cursor = conn.cursor()
cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
existing_embeddings = cursor.fetchall()
for existing_embedding_tuple in existing_embeddings:
existing_title = existing_embedding_tuple[0]
existing_embedding = np.array(existing_embedding_tuple[1]).flatten()
existing_link = existing_embedding_tuple[2]
similarity = calculate_cosine_similarity(existing_embedding, embedding)
if similarity > threshold:
print(f"Similar data found: \n #{title} \n #{existing_title}")
cursor.close()
conn.close()
return True
if link != existing_link:
similar_d = existing_title
insert_data(title,text,link,embedding,similar_d)
print(f"Similar data found: \n #{title} \n #{existing_title}")
print(f"Inserting: #{title}")
similar_d = "NO"
cursor.close()
return True
else:
print(f"Same article of same source!")
cursor.close()
return True
print(f"Inserting: #{title}")
cursor.close()
conn.close()
return False
def insert_data(title, text, link, embedding):
conn = psycopg2.connect(
host=host,
port=port,
user=user,
password=password,
dbname=dbname
)
def get_similar():
cursor = conn.cursor()
cursor.execute('''
INSERT INTO vectorsvevijesti (title, text, link, embedding)
VALUES (%s, %s, %s, %s);
''', (title, text, link, embedding))
conn.commit()
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
cursor.execute(query)
similar_data = cursor.fetchall()
cursor.close()
return similar_data
def get_titles_links_embeddings():
    """Fetch title, link and embedding of every row whose ``ready`` flag is true.

    Returns:
        Three parallel lists ``(titles, links, embeddings)``; each embedding
        has been passed through ``parse_embedding_string``.
    """
    cur = conn.cursor()
    cur.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
    rows = cur.fetchall()
    cur.close()

    titles = []
    links = []
    for title, link, _ in rows:
        titles.append(title)
        links.append(link)
    embeddings = [parse_embedding_string(raw) for _, _, raw in rows]
    return titles, links, embeddings
def insert_data(title, text, link, embedding, similar_d):
c_time = datetime.now()
cursor = conn.cursor()
cursor.execute('''
INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
VALUES (%s, %s, %s, %s, %s ,%s ,%s);
''', (title, text, link, embedding , similar_d, c_time, True))
conn.commit()
cursor.close()
conn.close()
def get_data():
conn = psycopg2.connect(
host=host,
port=port,
user=user,
password=password,
dbname=dbname
)
cursor = conn.cursor()
query = '''SELECT title,text,link FROM vectorsvevijesti;'''
cursor.execute(query)
data = cursor.fetchall()
cursor.close()
conn.close()
return data
def create_db():
conn = psycopg2.connect(
host=host,
port=port,
user=user,
password=password,
dbname=dbname
)
def get_ready_data():
    """Return ``(title, text, link, ready)`` for every row selected with ``ready = 'True'``."""
    statement = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
    cur = conn.cursor()
    cur.execute(statement, ('True',))
    rows = cur.fetchall()
    cur.close()
    return rows
def get_source_data():
    """Return ``(title, text, link, ready)`` for every row selected with ``ready = 'False'``."""
    statement = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
    cur = conn.cursor()
    cur.execute(statement, ('False',))
    rows = cur.fetchall()
    cur.close()
    return rows
def modify_similar_data(new_value ,title):
    """Set ``similar_d`` to *new_value* on every row whose title equals *title*.

    The change is committed before returning.
    """
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
    # The context manager closes the cursor, which was previously left open.
    with conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
    conn.commit()
def preparing_articles(new_value ,title):
    """Set the ``ready`` flag to *new_value* on every row whose title equals *title*.

    The change is committed before returning.
    """
    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
    # The context manager closes the cursor, which was previously left open.
    with conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
    conn.commit()
def get_specific_data(title):
    """Return all rows whose title equals *title*.

    Each row is ``(title, text, link, similar_d, embedding, ready)``; the
    list is empty when nothing matches.
    """
    statement = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s'''
    cur = conn.cursor()
    cur.execute(statement, (title,))
    matches = cur.fetchall()
    cur.close()
    return matches
def get_all_links():
    """Return the set of all values stored in the ``link`` column."""
    cur = conn.cursor()
    cur.execute('''SELECT link FROM vectorsvevijesti''')
    rows = cur.fetchall()
    # Each row holds a single column; collect that value.
    stored_links = {row[0] for row in rows}
    cur.close()
    return stored_links
def delete_specific(title):
    """Delete every row whose title equals *title* and commit the deletion."""
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
    with conn.cursor() as cursor:
        cursor.execute(query,(title,))
    # Without a commit the DELETE stays in the open transaction and only takes
    # effect if some later call happens to commit on the same connection.
    conn.commit()
def cleansing():
    """Delete rows whose ``time`` is more than one day before now and commit."""
    cutoff = datetime.now() - timedelta(days=1)
    cur = conn.cursor()
    cur.execute('''DELETE FROM vectorsvevijesti WHERE time < %s''', (cutoff,))
    conn.commit()
    cur.close()
def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists and commit."""
    cur = conn.cursor()
    cur.execute('''DROP TABLE IF EXISTS vectorsvevijesti;''')
    conn.commit()
    cur.close()
def create_db():
cursor = conn.cursor()
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)
cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
cursor.execute('''
CREATE TABLE vectorsvevijesti (
CREATE TABLE IF NOT EXISTS vectorsvevijesti (
id bigserial PRIMARY KEY,
title VARCHAR,
text VARCHAR,
link VARCHAR,
embedding vector(1536)
embedding vector(1536),
similar_d VARCHAR,
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ready BOOLEAN
);
''')
conn.commit()
cursor.close()
conn.close()
create_db()

29
pyth/web-server.py Normal file
View File

@@ -0,0 +1,29 @@
from flask import Flask , render_template , jsonify
from vectData import get_ready_data
from flask_cors import CORS
app = Flask(__name__)
CORS(app)


@app.route('/')
def index() :
    """Render the ``index.html`` template."""
    return render_template("index.html")


@app.route('/article/one')
def articleone():
    """Render the ``one.html`` template."""
    return render_template("one.html")


@app.route('/article/two')
def articletwo():
    """Render the ``two.html`` template."""
    return render_template("two.html")


@app.route('/data/get/news', methods=['GET'])
def takenews():
    """Return the rows from ``get_ready_data()`` as a JSON response."""
    data = get_ready_data()
    return jsonify(data)


if __name__ == "__main__":
    # Start the server only when executed as a script, not on import.
    # NOTE(review): debug=True turns on the interactive debugger and the
    # reloader; keep it for local development only.
    app.run(debug=True)