Merge branch 'svevijesti-cs' into 'master'

Combine similar articles

See merge request kbr4/svevijesti!5
This commit was merged in pull request #5.
This commit is contained in:
2024-01-08 09:45:03 +00:00
17 changed files with 856 additions and 83 deletions

7
pyth/.env Normal file
View File

@@ -0,0 +1,7 @@
OPENAI_API_KEY = "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"
DB_HOST =localhost
DB_PORT =5432
DB_USER =postgres
DB_PASSWORD =salmonela pljusti 221 hamo
DB_NAME =svevijestiweb

21
pyth/.gitlab-ci.yml Normal file
View File

@@ -0,0 +1,21 @@
stages:
- test
variables:
before_script:
- pip install -r requirements.txt
test_file1:
stage: test
script:
- python -m pytest tests/test_scrapingsingle.py
only:
- master
test_file2:
stage: test
script:
- python -m pytest tests/test_vectData.py
only:
- master

Binary file not shown.

Binary file not shown.

Binary file not shown.

241
pyth/articles.py Normal file
View File

@@ -0,0 +1,241 @@
import psycopg2
import numpy as np
from dotenv import load_dotenv
import os
from openai import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
import json
from json_repair import repair_json
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()
print(f"Checking for similar!")
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Group stored articles whose embeddings are near-duplicates.

    The first article that has not been grouped yet seeds a new group; every
    later, still ungrouped article whose cosine similarity to the seed exceeds
    ``threshold`` joins that group. Each article ends up in exactly one group.

    Args:
        eps: Unused; kept so existing callers keep working.
        min_samples: Unused; kept so existing callers keep working.
        threshold: Cosine similarity above which an article joins the seed's
            group.

    Returns:
        A list of groups, each a list of ``(title, link)`` tuples. Groups may
        hold a single article. An empty list is returned when the database
        read fails.
    """
    try:
        # Only the database read can raise psycopg2.Error.
        titles, links, vectors = get_titles_links_embeddings()
    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []

    records = list(zip(titles, links, vectors))
    processed_articles = set()
    grouped_similar_articles = []
    for i, (title1, link1, vector1) in enumerate(records):
        seed = (title1, link1)
        if seed in processed_articles:
            continue
        processed_articles.add(seed)
        group = [seed]
        # Every record before index i is already in processed_articles, so
        # only the records after the seed can still join its group.
        for title2, link2, vector2 in records[i + 1:]:
            candidate = (title2, link2)
            if candidate in processed_articles:
                continue
            if calculate_cosine_similarity(vector1, vector2) > threshold:
                processed_articles.add(candidate)
                group.append(candidate)
        grouped_similar_articles.append(group)
    return grouped_similar_articles
# Combined source texts above this many tokens are truncated before prompting.
_COMBINE_TOKEN_BUDGET = 2000


def _unique_in_order(values):
    """Return ``values`` without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(values))


def _build_combine_prompt(texts):
    """Build the user prompt asking the model to merge ``texts`` into one story."""
    count = len(texts)
    total_tokens = sum(num_tokens_from_string(text) for text in texts)
    if total_tokens > _COMBINE_TOKEN_BUDGET:
        combined_text = slice_text_at_2k_tokens(" ".join(texts))
        # The slicing helper may hand short input back wrapped in a list.
        if isinstance(combined_text, list):
            combined_text = combined_text[0]
        return (
            f"Here is text {combined_text}, combined from {count} sources, "
            "filter text, and make news content, return as JSON only with "
            "a single 'content' field"
        )
    joined = " ".join(texts[:-1]) + " and " + texts[-1]
    return (
        f"Here are {count} texts {joined}, combine the following texts into "
        "a cohesive news, remove any non-news related to all texts, and "
        "provide the cleaned data as a JSON only with a single 'content' field."
    )


def processing_similar():
    """Merge every group of similar articles into one combined article.

    For each group with at least two members the source rows are tagged with
    a shared ``"C: <titles>"`` marker and flagged as not ready, the model is
    asked to merge their texts, and the result is inserted under the first
    article's title with the de-duplicated source links joined by ``", "``.
    Groups of any size are handled. A group is skipped when one of its rows
    can no longer be found, and a failure while generating or storing the
    combined article is printed and does not stop the remaining groups.
    """
    groups = find_and_group_similar_articles()
    if not groups:
        print("No similar articles found.")
        return

    for group in groups:
        if len(group) < 2:
            continue
        titles = [member[0] for member in group if len(member) >= 2]
        if len(titles) < 2:
            continue

        rows = []
        for title in titles:
            found = get_specific_data(title)
            if not found:
                break
            rows.append(found[0])
        if len(rows) != len(titles):
            # A source row disappeared between grouping and lookup.
            print(f"Error: missing source article for group {titles[0]}")
            continue

        texts = [row[1] for row in rows]
        link = ", ".join(_unique_in_order(row[2] for row in rows))
        similar_d = "C: " + ", ".join(titles)
        for title in titles:
            modify_similar_data(similar_d, title)
            preparing_articles(False, title)

        user_message = _build_combine_prompt(texts)
        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message},
                ],
            )
            generated_text = repair_json(completion.choices[0].message.content)
            response_data = json.loads(generated_text)
            text = response_data["content"]
            # The embedding is computed from the repaired JSON string.
            vector = embeddings.embed_query(generated_text)
            insert_data(titles[0], text, link, vector, similar_d)
            print(f"Inserting combined: {titles[0]}")
        except Exception as e:
            # Per-group boundary: report and move on to the next group.
            print(f"Error: {e}")
            print(titles[0])
            continue
    print("Done!.")
if __name__=="__main__":
processing_similar()

141
pyth/requirements.txt Normal file
View File

@@ -0,0 +1,141 @@
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.2.0
apturl==0.5.2
async-timeout==4.0.3
attrs==23.1.0
beautifulsoup4==4.12.2
blinker==1.7.0
blis==0.7.11
Brlapi==0.8.3
catalogue==2.0.10
certifi==2020.6.20
chardet==4.0.0
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.16.0
colorama==0.4.4
command-not-found==0.3
confection==0.1.4
cryptography==3.4.8
cupshelpers==1.0
cymem==2.0.8
dataclasses-json==0.6.3
DateTime==5.4
dbus-python==1.2.18
decorator==4.4.2
defer==1.0.6
distro==1.7.0
distro-info==1.1+ubuntu0.1
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
exceptiongroup==1.2.0
Flask==3.0.0
Flask-Cors==4.0.0
frozenlist==1.4.1
greenlet==1.1.2
gyp==0.1
h11==0.14.0
httpcore==1.0.2
httplib2==0.20.2
httpx==0.25.2
idna==3.3
importlib-metadata==4.6.4
itsdangerous==2.1.2
jeepney==0.7.1
Jinja2==3.1.2
joblib==1.3.2
jsonpatch==1.33
jsonpointer==2.4
keyring==23.5.0
langchain==0.0.352
langchain-community==0.0.6
langchain-core==0.1.3
langcodes==3.3.0
langsmith==0.0.74
language-selector==0.1
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
louis==3.20.0
macaroonbakery==1.3.1
MarkupSafe==2.1.3
marshmallow==3.20.1
more-itertools==8.10.0
multidict==6.0.4
murmurhash==1.0.10
mypy-extensions==1.0.0
netifaces==0.11.0
numpy==1.26.2
oauthlib==3.2.0
olefile==0.46
openai==1.5.0
packaging==23.2
pbr==5.8.0
pexpect==4.8.0
pgvector==0.2.4
Pillow==9.0.1
preshed==3.0.9
protobuf==3.12.4
psycopg==3.1.15
psycopg2-binary==2.9.9
ptyprocess==0.7.0
pycairo==1.20.1
pycups==2.0.1
pydantic==2.5.2
pydantic_core==2.14.5
PyGObject==3.42.1
PyJWT==2.3.0
pymacaroons==0.13.0
PyNaCl==1.5.0
pyparsing==2.4.7
pyRFC3339==1.1
python-apt==2.4.0+ubuntu2
python-dateutil==2.8.1
python-debian==0.1.43+ubuntu1.1
python-dotenv==1.0.0
pytz==2022.1
pyxdg==0.27
PyYAML==5.4.1
regex==2023.10.3
reportlab==3.6.8
requests==2.31.0
scikit-learn==1.3.2
scipy==1.11.4
SecretStorage==3.3.1
six==1.16.0
slugify==0.0.1
smart-open==6.4.0
sniffio==1.3.0
soupsieve==2.5
spacy==3.7.2
spacy-legacy==3.0.12
spacy-loggers==1.0.5
SQLAlchemy==1.4.31
sqlalchemy-migrate==0.13.0
sqlparse==0.4.2
srsly==2.4.8
systemd-python==234
Tempita==0.5.2
tenacity==8.2.3
thinc==8.2.2
threadpoolctl==3.2.0
tiktoken==0.5.2
tqdm==4.66.1
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.9.0
ubuntu-advantage-tools==8001
ubuntu-drivers-common==0.0.0
ufw==0.36.1
unattended-upgrades==0.1
urllib3==1.26.5
wadllib==1.3.6
wasabi==1.1.2
weasel==0.3.4
Werkzeug==3.0.1
xdg==5
xkit==0.0.0
yarl==1.9.4
zipp==1.0.0
zope.interface==6.1

View File

@@ -1,21 +1,66 @@
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from openai import OpenAI
from openai import OpenAI
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from vectData import insert_data ,is_similar_data
from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
import json
from dotenv import load_dotenv
import tiktoken
from json_repair import repair_json
load_dotenv()
cleansing()
# The API key comes from the environment (populated from .env by
# load_dotenv()); it must never be hard-coded in source. Any key that was
# committed to the repository has to be revoked and rotated.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Count the tokens ``string`` occupies under ``model``'s tiktoken encoding."""
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)
def slice_text_at_2k_tokens(text, max_tokens=1950, model="gpt-3.5-turbo"):
    """Truncate ``text`` to at most ``max_tokens`` tokens.

    Args:
        text: The string to shorten.
        max_tokens: Maximum number of tokens to keep.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        ``text`` unchanged when it already fits, otherwise the decoded first
        ``max_tokens`` tokens. The result is always a string, so it can be
        interpolated into a prompt without a list wrapper leaking in.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])
def slice_title_if_needed(text, max_tokens=100, model="gpt-3.5-turbo"):
    """Truncate a title to at most ``max_tokens`` tokens.

    Args:
        text: The title to shorten.
        max_tokens: Maximum number of tokens to keep.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        ``text`` unchanged when it already fits, otherwise the decoded first
        ``max_tokens`` tokens. The result is always a string.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])
def replace_with_spaces(text):
allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 "
cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text)
return cleaned_text
def fix_links(links_set):
    """Return a new set of the given links with every ``www.`` removed.

    Links that do not contain ``www.`` are carried over unchanged; links that
    become identical after the removal collapse into one entry.
    """
    # str.replace is a no-op when "www." is absent, so no membership test is needed.
    return {address.replace("www.", "") for address in links_set}
total_links = set()
collected_news = set()
@@ -42,10 +87,17 @@ for dlink in dlinks:
temp_links = get_article_links(dlink, already_checked)
if temp_links:
total_links.update(temp_links)
final_links = {item for item in total_links if item}
for link in final_links:
db_links = set(get_all_links())
new_links = final_links - db_links
final_links = new_links
final_links = set(final_links)
final_links = fix_links(final_links)
if __name__ == '__main__':
for link in final_links:
# headers must be passed by keyword: the second positional argument of
# requests.get is `params`, which would put the User-Agent into the query string.
response = requests.get(link, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
@@ -54,34 +106,41 @@ for link in final_links:
texts = soup.find_all(['p'])
text_text = ' '.join([text.get_text(strip=True) for text in texts])
text_text = text_text
title_text = title_text
title_text = replace_with_spaces(title_text)
text_text = slice_text_at_2k_tokens(text_text)
text_text = replace_with_spaces(str(text_text))
ttk = num_tokens_from_string(text_text)
if ttk > 1900:
title_text = slice_title_if_needed(title_text)
try:
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "Data analytic, Journalist and News reporter"},
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."}
]
)
generated_text = completion.choices[0].message.content
generated_text = repair_json(generated_text)
response_data = json.loads(generated_text)
title = response_data["title"]
text = response_data["content"]
print("*********************************")
print(f"Title: {title}")
print("---------------------------------")
print(f"Content : {text}")
print("*********************************")
vector = embeddings.embed_query(generated_text)
if not is_similar_data(title, text, link, vector, threshold=0.9):
insert_data(title, text, link, vector)
if not is_similar_data(title, text, link, vector, threshold=0.98):
similar_d = "NO"
insert_data(title, text, link, vector,similar_d)
except Exception as e:
print(f"Error in completion: {e}")
continue

22
pyth/templates/index.html Normal file
View File

@@ -0,0 +1,22 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Test Pyth</title>
</head>
<body>
<div>
<article>
<h2>Test Title 1</h2>
<p>Test Text 1</p>
<a href="/article/one"> First</a>
</article>
<article>
<h2>Test Title 2</h2>
<p>Test Text 2</p>
<a href="/article/two">Second</a>
</article>
</div>
</body>
</html>

12
pyth/templates/one.html Normal file
View File

@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Article</title>
</head>
<body>
<h2>Test Title</h2>
<p>Test Text</p>
</body>
</html>

12
pyth/templates/two.html Normal file
View File

@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Article</title>
</head>
<body>
<h2>Test Title</h2>
<p>Test Text</p>
</body>
</html>

Binary file not shown.

View File

@@ -0,0 +1,60 @@
import unittest
from unittest.mock import patch
import requests
from bs4 import BeautifulSoup
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from openai import OpenAI
import json
from dotenv import load_dotenv
from scrapingsingle import get_article_links, insert_data, is_similar_data
import os
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()
already_checked = set()
total_links = set()
collected_news = set()
dlinks = 'http://127.0.0.1:5000/'
class TestIntegration(unittest.TestCase):
    """End-to-end check of link discovery and article cleaning.

    Expects the fixture web server to be reachable at ``dlinks`` and a valid
    OpenAI API key in the environment.
    """

    def test_integration(self):
        """Discover the fixture articles and verify the model's cleaned output."""
        found_links = get_article_links(dlinks, already_checked)
        # The loop below iterates total_links, so the discovered links have to
        # be recorded there first; otherwise the loop body never runs.
        # NOTE(review): assumes get_article_links returns an iterable of
        # article URLs, as the scraper's own main loop treats it - confirm.
        if found_links:
            total_links.update(found_links)
        self.assertEqual(len(already_checked), 2)

        for link in total_links:
            response = requests.get(link)
            soup = BeautifulSoup(response.text, 'html.parser')
            titles = soup.find_all(['h2', 'h1', 'h3'])
            title_text = ' '.join([title.get_text(strip=True) for title in titles])
            texts = soup.find_all(['p'])
            text_text = ' '.join([text.get_text(strip=True) for text in texts])
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
                ]
            )
            generated_text = completion.choices[0].message.content
            response_data = json.loads(generated_text)
            title = response_data["title"]
            text = response_data["content"]
            # Exercised only to make sure the embedding call succeeds.
            embeddings.embed_query(generated_text)
            self.assertIn("Test Title", title)
            self.assertIn("Test Text", text)
        self.assertEqual(len(total_links), 2)

View File

@@ -0,0 +1,89 @@
import unittest
import numpy as np
import psycopg2
import os
from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
class TestIntegration(unittest.TestCase):
    """Integration tests for the vectData helpers against a live PostgreSQL database.

    Connection settings are read from the DB_* environment variables.
    """

    @classmethod
    def setUpClass(cls):
        """Read the connection settings, connect, and make sure the table exists."""
        cls.host = os.getenv("DB_HOST")
        cls.port = os.getenv("DB_PORT")
        cls.user = os.getenv("DB_USER")
        cls.password = os.getenv("DB_PASSWORD")
        cls.dbname = os.getenv("DB_NAME")
        cls.conn = psycopg2.connect(
            host=cls.host,
            port=cls.port,
            user=cls.user,
            password=cls.password,
            dbname=cls.dbname
        )
        # create_db takes no arguments; it works on vectData's own connection.
        create_db()

    @classmethod
    def tearDownClass(cls):
        """Close the class-level connection."""
        cls.conn.close()

    def setUp(self):
        """Reconnect if a previous test closed the connection, then open a cursor."""
        if self.conn.closed:
            self.conn = psycopg2.connect(
                host=self.host,
                port=self.port,
                user=self.user,
                password=self.password,
                dbname=self.dbname
            )
        self.cursor = self.conn.cursor()

    def tearDown(self):
        """Release the per-test cursor and connection."""
        if not self.cursor.closed:
            self.cursor.close()
        if not self.conn.closed:
            self.conn.close()

    def test_insert_and_retrieve_data(self):
        """An inserted row is returned by get_data."""
        title = 'test_title'
        text = 'test_text'
        link = 'test_link'
        embedding = np.arange(1, 1537)
        insert_data(title, text, link, embedding, "NO")
        data = get_data()
        # The table may already hold other rows, so check membership only.
        self.assertIn((title, text, link), data)

    def test_is_similar_data_integration(self):
        """A row identical to a stored one is reported as similar, repeatedly."""
        title = 'test_title'
        text = 'test_text'
        link = 'test_link'
        embedding = np.arange(1, 1537)
        insert_data(title, text, link, embedding, "NO")
        for _ in range(3):
            result = is_similar_data(title, text, link, embedding)
            self.assertTrue(result)

    def test_create_db_integration(self):
        """The vectorsvevijesti table exists after create_db has run."""
        self.cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'")
        table_exist = bool(self.cursor.fetchone())
        self.assertTrue(table_exist)
if __name__ == '__main__':
unittest.main()

View File

@@ -3,113 +3,193 @@ from psycopg2 import sql
from pgvector.psycopg2 import register_vector
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from dotenv import load_dotenv
from datetime import datetime ,timedelta
host = 'localhost'
port = '5432'
user = 'postgres'
password = 'salmonela pljusti 221 hamo'
dbname = 'vector_svw'
load_dotenv()
def calculate_cosine_similarity(v1, v2):
v1_normalized = v1 / np.linalg.norm(v1)
v2_normalized = v2 / np.linalg.norm(v2)
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME")
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
return similarity
def is_similar_data(title, text, link, embedding, threshold=0.9):
conn = psycopg2.connect(
conn = psycopg2.connect(
host=host,
port=port,
user=user,
password=password,
dbname=dbname
)
cursor = conn.cursor()
cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity between vectors ``v1`` and ``v2``.

    Both vectors are scaled to unit length before being handed to
    scikit-learn's pairwise ``cosine_similarity``; the single entry of the
    resulting 1x1 matrix is returned.
    """
    unit_first = v1 / np.linalg.norm(v1)
    unit_second = v2 / np.linalg.norm(v2)
    pairwise = cosine_similarity([unit_first], [unit_second])
    return pairwise[0][0]
def parse_embedding_string(embedding_str):
    """Convert a stored embedding into a NumPy array.

    Args:
        embedding_str: Either a bracketed, comma-separated string such as
            ``"[0.1,0.2]"`` or an already parsed ``np.ndarray``.

    Returns:
        The array itself when one was passed in, otherwise a new float array
        parsed from the string.

    Raises:
        ValueError: If the argument is neither a ``str`` nor an ``np.ndarray``,
            or a string component is not a valid float.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str
    if not isinstance(embedding_str, str):
        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")
    # Drop the surrounding brackets before splitting on commas.
    inner = embedding_str[1:-1]
    return np.array([float(piece) for piece in inner.split(',')])
def is_similar_data(title, text, link, embedding, threshold=0.98):
cursor = conn.cursor()
cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
existing_embeddings = cursor.fetchall()
for existing_embedding_tuple in existing_embeddings:
existing_title = existing_embedding_tuple[0]
existing_embedding = np.array(existing_embedding_tuple[1]).flatten()
existing_link = existing_embedding_tuple[2]
similarity = calculate_cosine_similarity(existing_embedding, embedding)
if similarity > threshold:
print(f"Similar data found: \n #{title} \n #{existing_title}")
cursor.close()
conn.close()
return True
if link != existing_link:
similar_d = existing_title
insert_data(title,text,link,embedding,similar_d)
print(f"Similar data found: \n #{title} \n #{existing_title}")
print(f"Inserting: #{title}")
similar_d = "NO"
cursor.close()
return True
else:
print(f"Same article of same source!")
cursor.close()
return True
print(f"Inserting: #{title}")
cursor.close()
conn.close()
return False
def insert_data(title, text, link, embedding):
conn = psycopg2.connect(
host=host,
port=port,
user=user,
password=password,
dbname=dbname
)
def get_similar():
cursor = conn.cursor()
cursor.execute('''
INSERT INTO vectorsvevijesti (title, text, link, embedding)
VALUES (%s, %s, %s, %s);
''', (title, text, link, embedding))
conn.commit()
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
cursor.execute(query)
similar_data = cursor.fetchall()
cursor.close()
return similar_data
def get_titles_links_embeddings():
    """Fetch title, link and embedding of every row flagged as ready.

    Returns:
        A ``(titles, links, embeddings)`` tuple of three parallel lists; each
        embedding has been passed through ``parse_embedding_string``.
    """
    cur = conn.cursor()
    cur.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
    rows = cur.fetchall()
    cur.close()

    titles, links, vectors = [], [], []
    for record in rows:
        titles.append(record[0])
        links.append(record[1])
        vectors.append(parse_embedding_string(record[2]))
    return titles, links, vectors
def insert_data(title, text, link, embedding, similar_d="NO"):
    """Insert one article row, stamped with the current time and marked ready.

    Args:
        title: Article title.
        text: Article body.
        link: Source link(s) of the article.
        embedding: Embedding vector stored in the ``embedding`` column.
        similar_d: Similarity marker for the row. Defaults to ``"NO"`` so that
            callers using the earlier four-argument form keep working.

    The module-level connection is committed but left open, because the other
    helpers in this module share it.
    """
    c_time = datetime.now()
    # The context manager closes the cursor even when execute() raises.
    with conn.cursor() as cursor:
        cursor.execute('''
            INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
            VALUES (%s, %s, %s, %s, %s ,%s ,%s);
        ''', (title, text, link, embedding, similar_d, c_time, True))
    conn.commit()
def get_data():
conn = psycopg2.connect(
host=host,
port=port,
user=user,
password=password,
dbname=dbname
)
cursor = conn.cursor()
query = '''SELECT title,text,link FROM vectorsvevijesti;'''
cursor.execute(query)
data = cursor.fetchall()
cursor.close()
conn.close()
return data
def create_db():
conn = psycopg2.connect(
host=host,
port=port,
user=user,
password=password,
dbname=dbname
)
def get_ready_data():
    """Return ``(title, text, link, ready)`` for every row whose ``ready`` flag is true."""
    cur = conn.cursor()
    cur.execute(
        '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''',
        ('True',),
    )
    rows = cur.fetchall()
    cur.close()
    return rows
def get_source_data():
    """Return ``(title, text, link, ready)`` for every row whose ``ready`` flag is false."""
    cur = conn.cursor()
    cur.execute(
        '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''',
        ('False',),
    )
    rows = cur.fetchall()
    cur.close()
    return rows
def modify_similar_data(new_value ,title):
    """Set ``similar_d`` to ``new_value`` on every row with the given title.

    Args:
        new_value: The new similarity marker.
        title: Title identifying the row(s) to update.
    """
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
    # The context manager closes the cursor, which was previously left open.
    with conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
    conn.commit()
def preparing_articles(new_value ,title):
    """Set the ``ready`` flag to ``new_value`` on every row with the given title.

    Args:
        new_value: The new value of the ``ready`` column.
        title: Title identifying the row(s) to update.
    """
    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
    # The context manager closes the cursor, which was previously left open.
    with conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
    conn.commit()
def get_specific_data(title):
    """Return all rows with the given title.

    Each row is ``(title, text, link, similar_d, embedding, ready)``; the
    list is empty when no row matches.
    """
    cur = conn.cursor()
    cur.execute(
        '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s''',
        (title,),
    )
    matches = cur.fetchall()
    cur.close()
    return matches
def get_all_links():
    """Return the set of all values stored in the ``link`` column."""
    cur = conn.cursor()
    cur.execute('''SELECT link FROM vectorsvevijesti''')
    stored = set()
    for row in cur.fetchall():
        stored.add(row[0])
    cur.close()
    return stored
def delete_specific(title):
    """Delete every row with the given title and commit the change.

    Args:
        title: Title identifying the row(s) to remove.
    """
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
    with conn.cursor() as cursor:
        cursor.execute(query,(title,))
    # Commit like the other write helpers do; without it the delete stays
    # pending in the open transaction and is lost if that is rolled back.
    conn.commit()
def cleansing():
    """Delete every row whose ``time`` is more than one day in the past and commit."""
    cutoff = datetime.now() - timedelta(days=1)
    cur = conn.cursor()
    cur.execute('''DELETE FROM vectorsvevijesti WHERE time < %s''', (cutoff,))
    conn.commit()
    cur.close()
def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists and commit."""
    cur = conn.cursor()
    cur.execute('''DROP TABLE IF EXISTS vectorsvevijesti;''')
    conn.commit()
    cur.close()
def create_db():
cursor = conn.cursor()
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)
cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
cursor.execute('''
CREATE TABLE vectorsvevijesti (
CREATE TABLE IF NOT EXISTS vectorsvevijesti (
id bigserial PRIMARY KEY,
title VARCHAR,
text VARCHAR,
link VARCHAR,
embedding vector(1536)
embedding vector(1536),
similar_d VARCHAR,
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
ready BOOLEAN
);
''')
conn.commit()
cursor.close()
conn.close()
create_db()
create_db()

29
pyth/web-server.py Normal file
View File

@@ -0,0 +1,29 @@
from flask import Flask , render_template , jsonify
from vectData import get_ready_data
from flask_cors import CORS
app = Flask(__name__)
CORS(app)


@app.route('/')
def index() :
    """Serve the fixture index page."""
    return render_template("index.html")


@app.route('/article/one')
def articleone():
    """Serve the first fixture article."""
    return render_template("one.html")


@app.route('/article/two')
def articletwo():
    """Serve the second fixture article."""
    return render_template("two.html")


@app.route('/data/get/news', methods=['GET'])
def takenews():
    """Return every ready article as JSON."""
    data = get_ready_data()
    return jsonify(data)


if __name__ == "__main__":
    # Start the development server only when run as a script, not on import.
    # NOTE(review): debug=True enables the interactive debugger, which allows
    # arbitrary code execution; never expose this server publicly.
    app.run(debug=True)