Combine similar article
This commit is contained in:
7
pyth/.env
Normal file
7
pyth/.env
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
OPENAI_API_KEY = "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"
|
||||||
|
|
||||||
|
DB_HOST =localhost
|
||||||
|
DB_PORT =5432
|
||||||
|
DB_USER =postgres
|
||||||
|
DB_PASSWORD =salmonela pljusti 221 hamo
|
||||||
|
DB_NAME =svevijestiweb
|
||||||
21
pyth/.gitlab-ci.yml
Normal file
21
pyth/.gitlab-ci.yml
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
stages:
|
||||||
|
- test
|
||||||
|
|
||||||
|
variables:
|
||||||
|
|
||||||
|
before_script:
|
||||||
|
- pip install -r requirements.txt
|
||||||
|
|
||||||
|
test_file1:
|
||||||
|
stage: test
|
||||||
|
script:
|
||||||
|
- python -m pytest tests/test_scrapingsingle.py
|
||||||
|
only:
|
||||||
|
- master
|
||||||
|
|
||||||
|
test_file2:
|
||||||
|
stage: test
|
||||||
|
script:
|
||||||
|
- python -m pytest tests/test_vectData.py
|
||||||
|
only:
|
||||||
|
- master
|
||||||
BIN
pyth/__pycache__/scrapingsingle.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/scrapingsingle.cpython-310.pyc
Normal file
Binary file not shown.
BIN
pyth/__pycache__/vectData.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/vectData.cpython-310.pyc
Normal file
Binary file not shown.
141
pyth/requirements.txt
Normal file
141
pyth/requirements.txt
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
aiohttp==3.9.1
|
||||||
|
aiosignal==1.3.1
|
||||||
|
annotated-types==0.6.0
|
||||||
|
anyio==4.2.0
|
||||||
|
apturl==0.5.2
|
||||||
|
async-timeout==4.0.3
|
||||||
|
attrs==23.1.0
|
||||||
|
beautifulsoup4==4.12.2
|
||||||
|
blinker==1.7.0
|
||||||
|
blis==0.7.11
|
||||||
|
Brlapi==0.8.3
|
||||||
|
catalogue==2.0.10
|
||||||
|
certifi==2020.6.20
|
||||||
|
chardet==4.0.0
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
click==8.1.7
|
||||||
|
cloudpathlib==0.16.0
|
||||||
|
colorama==0.4.4
|
||||||
|
command-not-found==0.3
|
||||||
|
confection==0.1.4
|
||||||
|
cryptography==3.4.8
|
||||||
|
cupshelpers==1.0
|
||||||
|
cymem==2.0.8
|
||||||
|
dataclasses-json==0.6.3
|
||||||
|
DateTime==5.4
|
||||||
|
dbus-python==1.2.18
|
||||||
|
decorator==4.4.2
|
||||||
|
defer==1.0.6
|
||||||
|
distro==1.7.0
|
||||||
|
distro-info==1.1+ubuntu0.1
|
||||||
|
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
|
||||||
|
exceptiongroup==1.2.0
|
||||||
|
Flask==3.0.0
|
||||||
|
Flask-Cors==4.0.0
|
||||||
|
frozenlist==1.4.1
|
||||||
|
greenlet==1.1.2
|
||||||
|
gyp==0.1
|
||||||
|
h11==0.14.0
|
||||||
|
httpcore==1.0.2
|
||||||
|
httplib2==0.20.2
|
||||||
|
httpx==0.25.2
|
||||||
|
idna==3.3
|
||||||
|
importlib-metadata==4.6.4
|
||||||
|
itsdangerous==2.1.2
|
||||||
|
jeepney==0.7.1
|
||||||
|
Jinja2==3.1.2
|
||||||
|
joblib==1.3.2
|
||||||
|
jsonpatch==1.33
|
||||||
|
jsonpointer==2.4
|
||||||
|
keyring==23.5.0
|
||||||
|
langchain==0.0.352
|
||||||
|
langchain-community==0.0.6
|
||||||
|
langchain-core==0.1.3
|
||||||
|
langcodes==3.3.0
|
||||||
|
langsmith==0.0.74
|
||||||
|
language-selector==0.1
|
||||||
|
launchpadlib==1.10.16
|
||||||
|
lazr.restfulclient==0.14.4
|
||||||
|
lazr.uri==1.0.6
|
||||||
|
louis==3.20.0
|
||||||
|
macaroonbakery==1.3.1
|
||||||
|
MarkupSafe==2.1.3
|
||||||
|
marshmallow==3.20.1
|
||||||
|
more-itertools==8.10.0
|
||||||
|
multidict==6.0.4
|
||||||
|
murmurhash==1.0.10
|
||||||
|
mypy-extensions==1.0.0
|
||||||
|
netifaces==0.11.0
|
||||||
|
numpy==1.26.2
|
||||||
|
oauthlib==3.2.0
|
||||||
|
olefile==0.46
|
||||||
|
openai==1.5.0
|
||||||
|
packaging==23.2
|
||||||
|
pbr==5.8.0
|
||||||
|
pexpect==4.8.0
|
||||||
|
pgvector==0.2.4
|
||||||
|
Pillow==9.0.1
|
||||||
|
preshed==3.0.9
|
||||||
|
protobuf==3.12.4
|
||||||
|
psycopg==3.1.15
|
||||||
|
psycopg2-binary==2.9.9
|
||||||
|
ptyprocess==0.7.0
|
||||||
|
pycairo==1.20.1
|
||||||
|
pycups==2.0.1
|
||||||
|
pydantic==2.5.2
|
||||||
|
pydantic_core==2.14.5
|
||||||
|
PyGObject==3.42.1
|
||||||
|
PyJWT==2.3.0
|
||||||
|
pymacaroons==0.13.0
|
||||||
|
PyNaCl==1.5.0
|
||||||
|
pyparsing==2.4.7
|
||||||
|
pyRFC3339==1.1
|
||||||
|
python-apt==2.4.0+ubuntu2
|
||||||
|
python-dateutil==2.8.1
|
||||||
|
python-debian==0.1.43+ubuntu1.1
|
||||||
|
python-dotenv==1.0.0
|
||||||
|
pytz==2022.1
|
||||||
|
pyxdg==0.27
|
||||||
|
PyYAML==5.4.1
|
||||||
|
regex==2023.10.3
|
||||||
|
reportlab==3.6.8
|
||||||
|
requests==2.31.0
|
||||||
|
scikit-learn==1.3.2
|
||||||
|
scipy==1.11.4
|
||||||
|
SecretStorage==3.3.1
|
||||||
|
six==1.16.0
|
||||||
|
slugify==0.0.1
|
||||||
|
smart-open==6.4.0
|
||||||
|
sniffio==1.3.0
|
||||||
|
soupsieve==2.5
|
||||||
|
spacy==3.7.2
|
||||||
|
spacy-legacy==3.0.12
|
||||||
|
spacy-loggers==1.0.5
|
||||||
|
SQLAlchemy==1.4.31
|
||||||
|
sqlalchemy-migrate==0.13.0
|
||||||
|
sqlparse==0.4.2
|
||||||
|
srsly==2.4.8
|
||||||
|
systemd-python==234
|
||||||
|
Tempita==0.5.2
|
||||||
|
tenacity==8.2.3
|
||||||
|
thinc==8.2.2
|
||||||
|
threadpoolctl==3.2.0
|
||||||
|
tiktoken==0.5.2
|
||||||
|
tqdm==4.66.1
|
||||||
|
typer==0.9.0
|
||||||
|
typing-inspect==0.9.0
|
||||||
|
typing_extensions==4.9.0
|
||||||
|
ubuntu-advantage-tools==8001
|
||||||
|
ubuntu-drivers-common==0.0.0
|
||||||
|
ufw==0.36.1
|
||||||
|
unattended-upgrades==0.1
|
||||||
|
urllib3==1.26.5
|
||||||
|
wadllib==1.3.6
|
||||||
|
wasabi==1.1.2
|
||||||
|
weasel==0.3.4
|
||||||
|
Werkzeug==3.0.1
|
||||||
|
xdg==5
|
||||||
|
xkit==0.0.0
|
||||||
|
yarl==1.9.4
|
||||||
|
zipp==1.0.0
|
||||||
|
zope.interface==6.1
|
||||||
@@ -1,15 +1,20 @@
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests
|
import requests
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from openai import OpenAI
|
from openai import OpenAI , APIError
|
||||||
import os
|
import os
|
||||||
from langchain.embeddings import OpenAIEmbeddings
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
from langchain.vectorstores.pgvector import PGVector
|
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data, delete_specific,get_all_links,cleansing ,modify_similar_data)
|
||||||
from vectData import insert_data ,is_similar_data
|
|
||||||
import json
|
import json
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import tiktoken
|
||||||
|
|
||||||
|
|
||||||
os.environ["OPENAI_API_KEY"] = "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"
|
load_dotenv()
|
||||||
|
cleansing()
|
||||||
|
|
||||||
|
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
||||||
|
|
||||||
client = OpenAI()
|
client = OpenAI()
|
||||||
embeddings = OpenAIEmbeddings()
|
embeddings = OpenAIEmbeddings()
|
||||||
|
|
||||||
@@ -17,9 +22,36 @@ dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
|
|||||||
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
|
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Return the number of tokens `string` encodes to for `model`.

    Args:
        string: Text to tokenize.
        model: OpenAI model name used to pick the tiktoken encoding.

    Returns:
        The length of the token sequence produced for `string`.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it has no mapping for;
        # fall back to the encoding used by the gpt-3.5 / gpt-4 chat models
        # instead of aborting the whole run.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(string))
|
||||||
|
|
||||||
|
def slice_text_at_2k_tokens(text, max_tokens=2000, model="gpt-3.5-turbo"):
    """Truncate `text` so that it encodes to at most `max_tokens` tokens.

    Args:
        text: The string to truncate.
        max_tokens: Maximum number of tokens to keep (default 2000).
        model: Model name used to pick the tiktoken encoding.

    Returns:
        `text` unchanged when it already fits, otherwise the decoded text of
        its first `max_tokens` tokens. The result is always a `str`.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)

    if len(tokens) <= max_tokens:
        # Return the string itself rather than a one-element list: callers
        # interpolate / str() the result, and a list would leak "['...']"
        # into the prompt text.
        return text

    return encoding.decode(tokens[:max_tokens])
|
||||||
|
|
||||||
|
|
||||||
|
# Characters that survive replace_with_spaces(); everything else becomes ' '.
# Built once as a frozenset so each membership test is O(1).
_ALLOWED_CHARS = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 "
)


def replace_with_spaces(text):
    """Replace every character outside the allowed alphabet with a space.

    Letters (ASCII plus the diacritic letters listed in `_ALLOWED_CHARS`),
    digits and spaces are kept; any other character, including punctuation,
    newlines and tabs, is replaced one-for-one by a single space, so the
    result has the same length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return ''.join(char if char in _ALLOWED_CHARS else ' ' for char in text)
|
||||||
|
|
||||||
total_links = set()
|
total_links = set()
|
||||||
collected_news = set()
|
collected_news = set()
|
||||||
|
|
||||||
|
|
||||||
def get_article_links(url, already_checked):
|
def get_article_links(url, already_checked):
|
||||||
response = requests.get(url,headers)
|
response = requests.get(url,headers)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
@@ -36,6 +68,8 @@ def get_article_links(url, already_checked):
|
|||||||
already_checked.add(link_value)
|
already_checked.add(link_value)
|
||||||
return link_store
|
return link_store
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
already_checked = set()
|
already_checked = set()
|
||||||
|
|
||||||
for dlink in dlinks:
|
for dlink in dlinks:
|
||||||
@@ -44,8 +78,17 @@ for dlink in dlinks:
|
|||||||
total_links.update(temp_links)
|
total_links.update(temp_links)
|
||||||
|
|
||||||
final_links = {item for item in total_links if item}
|
final_links = {item for item in total_links if item}
|
||||||
|
i = 0
|
||||||
|
|
||||||
for link in final_links:
|
db_links = set(get_all_links())
|
||||||
|
new_links = final_links - db_links
|
||||||
|
final_links = new_links
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
for link in final_links:
|
||||||
response = requests.get(link,headers)
|
response = requests.get(link,headers)
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
|
|
||||||
@@ -54,6 +97,16 @@ for link in final_links:
|
|||||||
|
|
||||||
texts = soup.find_all(['p'])
|
texts = soup.find_all(['p'])
|
||||||
text_text = ' '.join([text.get_text(strip=True) for text in texts])
|
text_text = ' '.join([text.get_text(strip=True) for text in texts])
|
||||||
|
|
||||||
|
text_text = text_text
|
||||||
|
title_text = title_text
|
||||||
|
|
||||||
|
title_text = replace_with_spaces(title_text)
|
||||||
|
|
||||||
|
|
||||||
|
print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")
|
||||||
|
text_text = slice_text_at_2k_tokens(text_text)
|
||||||
|
text_text = replace_with_spaces(str(text_text))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
completion = client.chat.completions.create(
|
completion = client.chat.completions.create(
|
||||||
@@ -65,23 +118,130 @@ for link in final_links:
|
|||||||
)
|
)
|
||||||
generated_text = completion.choices[0].message.content
|
generated_text = completion.choices[0].message.content
|
||||||
|
|
||||||
|
generated_text = generated_text
|
||||||
|
|
||||||
response_data = json.loads(generated_text)
|
response_data = json.loads(generated_text)
|
||||||
|
|
||||||
title = response_data["title"]
|
title = response_data["title"]
|
||||||
text = response_data["content"]
|
text = response_data["content"]
|
||||||
|
|
||||||
print("*********************************")
|
#print("*********************************")
|
||||||
print(f"Title: {title}")
|
#print(f"Title: {title}")
|
||||||
print("---------------------------------")
|
#print("---------------------------------")
|
||||||
print(f"Content : {text}")
|
#print(f"Content : {text}")
|
||||||
print("*********************************")
|
#print("*********************************")
|
||||||
|
|
||||||
|
|
||||||
vector = embeddings.embed_query(generated_text)
|
vector = embeddings.embed_query(generated_text)
|
||||||
|
|
||||||
if not is_similar_data(title, text, link, vector, threshold=0.9):
|
|
||||||
insert_data(title, text, link, vector)
|
|
||||||
|
|
||||||
|
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
||||||
|
similar_d = "NO"
|
||||||
|
insert_data(title, text, link, vector,similar_d)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in completion: {e}")
|
print(f"Error in completion: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
def comb_similar():
    """Combine stored articles that were flagged as similar into one article.

    For every (title, similar_d) pair returned by get_similar(), both
    articles are loaded, a merge prompt is sent to the chat model, the
    originals are marked with similar_d = "SOURCE", and the merged article is
    inserted unless is_similar_data() reports a match for it. A failure while
    processing one pair is printed and the next pair is processed.
    """
    print("Checking similar")
    similar_article = get_similar()

    grouped_data = {}

    # Iterate over a snapshot and never remove from the list mid-loop:
    # list.remove() during iteration silently skips the following pair.
    for sa in list(similar_article):
        first_t = get_specific_data(sa[0])
        second_t = get_specific_data(sa[1])
        if not first_t or not second_t:
            # No row stored for one of the titles; indexing [0] would raise
            # IndexError outside the try block and abort the whole run.
            continue

        # Row layout is (title, text, link, ...).
        f_title, f_text, link_f = first_t[0][:3]
        s_title, s_text, link_s = second_t[0][:3]

        grouped_data.setdefault(f_title, []).append((f_text, link_f))
        grouped_data.setdefault(s_title, []).append((s_text, link_s))

        # NOTE(review): this loop walks every group collected so far and each
        # iteration overwrites user_message/link, so only the last group
        # decides the prompt. Kept as in the original; confirm whether only
        # the current pair's group was meant to be inspected.
        for title, tuples in grouped_data.items():
            if len(tuples) == 3:
                (text1, link1), (text2, link2), (text3, link3) = tuples

                # Sum the three counts; the original compared a tuple with an
                # int, which raises TypeError.
                slice_if_more = sum(
                    num_tokens_from_string(t) for t in (text1, text2, text3)
                )
                if slice_if_more < 2000:
                    combined_text = slice_text_at_2k_tokens(f"{text1}{text2}{text3}")
                    if isinstance(combined_text, list):
                        # slice_text_at_2k_tokens may hand back a one-element
                        # list for short input; unwrap it for the prompt.
                        combined_text = combined_text[0]
                    user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field"
                else:
                    user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
                link = f"{link1} {link2} {link3}"
            else:
                fscomb = num_tokens_from_string(f_text) + num_tokens_from_string(s_text)
                if fscomb < 2000:
                    combined_text = f"{f_text}{s_text}"
                    user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field"
                else:
                    user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
                link = f"{link_f} {link_s}"

        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message}
                ]
            )
            generated_text = completion.choices[0].message.content

            # Mark the originals so get_similar() no longer returns them.
            # modify_similar_data takes (new_value, title); the original call
            # passed the fetched rows as the value and "SOURCE" as the title.
            if f_title == s_title:
                print(f_title)
                modify_similar_data("SOURCE", f_title)
                print("Modified")
            else:
                print(f"Second: {s_title}")
                modify_similar_data("SOURCE", f_title)
                modify_similar_data("SOURCE", s_title)
                print("Modified")

            response_data = json.loads(generated_text)
            title = f_title
            text = response_data["content"]

            vector = embeddings.embed_query(generated_text)

            if not is_similar_data(title, text, link, vector, threshold=0.98):
                similar_d = "NO"
                insert_data(title, text, link, vector, similar_d)

        except Exception as e:
            # Best-effort: report the failure and move on to the next pair.
            print(f"Error in completion: {e}")
            continue
|
||||||
|
|
||||||
|
comb_similar()
|
||||||
23
pyth/templates/index.html
Normal file
23
pyth/templates/index.html
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Test Pyth</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div>
|
||||||
|
<article>
|
||||||
|
<h2>Test Title 1</h2>
|
||||||
|
<p>Test Text 1</p>
|
||||||
|
<a href="/article/one"> First</a>
|
||||||
|
</article>
|
||||||
|
<article>
|
||||||
|
<h2>Test Title 2</h2>
|
||||||
|
<p>Test Text 2</p>
|
||||||
|
<a href="/article/two">Second</a>
|
||||||
|
</article>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
12
pyth/templates/one.html
Normal file
12
pyth/templates/one.html
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Article</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h2>Test Title</h2>
|
||||||
|
<p>Test Text</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
12
pyth/templates/two.html
Normal file
12
pyth/templates/two.html
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Article</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h2>Test Title</h2>
|
||||||
|
<p>Test Text</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
BIN
pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc
Normal file
BIN
pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc
Normal file
Binary file not shown.
BIN
pyth/tests/__pycache__/test_vectData.cpython-310.pyc
Normal file
BIN
pyth/tests/__pycache__/test_vectData.cpython-310.pyc
Normal file
Binary file not shown.
60
pyth/tests/test_scrapingsingle.py
Normal file
60
pyth/tests/test_scrapingsingle.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
import unittest
|
||||||
|
from unittest.mock import patch
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
|
from langchain.vectorstores.pgvector import PGVector
|
||||||
|
from openai import OpenAI
|
||||||
|
import json
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from scrapingsingle import get_article_links, insert_data, is_similar_data
|
||||||
|
import os
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
||||||
|
client = OpenAI()
|
||||||
|
embeddings = OpenAIEmbeddings()
|
||||||
|
|
||||||
|
|
||||||
|
already_checked = set()
|
||||||
|
total_links = set()
|
||||||
|
collected_news = set()
|
||||||
|
dlinks = 'http://127.0.0.1:5000/'
|
||||||
|
|
||||||
|
class TestIntegration(unittest.TestCase):
|
||||||
|
|
||||||
|
|
||||||
|
def test_integration(self):
|
||||||
|
link = get_article_links(dlinks,already_checked)
|
||||||
|
self.assertEqual(len(already_checked), 2)
|
||||||
|
|
||||||
|
for link in total_links:
|
||||||
|
response = requests.get(link)
|
||||||
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
|
|
||||||
|
titles = soup.find_all(['h2', 'h1', 'h3'])
|
||||||
|
title_text = ' '.join([title.get_text(strip=True) for title in titles])
|
||||||
|
|
||||||
|
texts = soup.find_all(['p'])
|
||||||
|
text_text = ' '.join([text.get_text(strip=True) for text in texts])
|
||||||
|
|
||||||
|
completion = client.chat.completions.create(
|
||||||
|
model="gpt-3.5-turbo",
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": "Data analytic, Journalist and News reporter"},
|
||||||
|
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
generated_text = completion.choices[0].message.content
|
||||||
|
|
||||||
|
response_data = json.loads(generated_text)
|
||||||
|
title = response_data["title"]
|
||||||
|
text = response_data["content"]
|
||||||
|
|
||||||
|
vector = embeddings.embed_query(generated_text)
|
||||||
|
|
||||||
|
self.assertIn("Test Title", title)
|
||||||
|
self.assertIn("Test Text", text)
|
||||||
|
self.assertEqual(len(total_links), 2)
|
||||||
|
|
||||||
89
pyth/tests/test_vectData.py
Normal file
89
pyth/tests/test_vectData.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
import unittest
|
||||||
|
import numpy as np
|
||||||
|
import psycopg2
|
||||||
|
import os
|
||||||
|
from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
|
||||||
|
|
||||||
|
class TestIntegration(unittest.TestCase):
|
||||||
|
host = os.getenv("DB_HOST")
|
||||||
|
port = os.getenv("DB_PORT")
|
||||||
|
user = os.getenv("DB_USER")
|
||||||
|
password = os.getenv("DB_PASSWORD")
|
||||||
|
dbname = os.getenv("DB_NAME")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls):
|
||||||
|
cls.host = os.getenv("DB_HOST")
|
||||||
|
cls.port = os.getenv("DB_PORT")
|
||||||
|
cls.user = os.getenv("DB_USER")
|
||||||
|
cls.password = os.getenv("DB_PASSWORD")
|
||||||
|
cls.dbname = os.getenv("DB_NAME")
|
||||||
|
|
||||||
|
cls.conn = psycopg2.connect(
|
||||||
|
host=cls.host,
|
||||||
|
port=cls.port,
|
||||||
|
user=cls.user,
|
||||||
|
password=cls.password,
|
||||||
|
dbname=cls.dbname
|
||||||
|
)
|
||||||
|
create_db(cls.conn)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def tearDownClass(cls):
|
||||||
|
cls.conn.close()
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
if self.conn.closed:
|
||||||
|
self.conn = psycopg2.connect(
|
||||||
|
host=self.host,
|
||||||
|
port=self.port,
|
||||||
|
user=self.user,
|
||||||
|
password=self.password,
|
||||||
|
dbname=self.dbname
|
||||||
|
)
|
||||||
|
self.cursor = self.conn.cursor()
|
||||||
|
|
||||||
|
def tearDown(self):
|
||||||
|
if not self.cursor.closed:
|
||||||
|
self.cursor.close()
|
||||||
|
|
||||||
|
if not self.conn.closed:
|
||||||
|
self.conn.close()
|
||||||
|
|
||||||
|
def test_insert_and_retrieve_data(self):
|
||||||
|
title = 'test_title'
|
||||||
|
text = 'test_text'
|
||||||
|
link = 'test_link'
|
||||||
|
embedding = np.arange(1, 1537)
|
||||||
|
|
||||||
|
insert_data(title, text, link, embedding)
|
||||||
|
|
||||||
|
data = get_data()
|
||||||
|
|
||||||
|
self.assertEqual(data, [(title, text, link)])
|
||||||
|
|
||||||
|
def test_is_similar_data_integration(self):
|
||||||
|
title = 'test_title'
|
||||||
|
text = 'test_text'
|
||||||
|
link = 'test_link'
|
||||||
|
embedding = np.arange(1, 1537)
|
||||||
|
|
||||||
|
insert_data(title, text, link, embedding)
|
||||||
|
|
||||||
|
result = is_similar_data(title, text, link, embedding)
|
||||||
|
self.assertTrue(result)
|
||||||
|
|
||||||
|
result = is_similar_data(title, text, link, embedding)
|
||||||
|
self.assertTrue(result)
|
||||||
|
|
||||||
|
result = is_similar_data(title, text, link, embedding)
|
||||||
|
self.assertTrue(result)
|
||||||
|
|
||||||
|
def test_create_db_integration(self):
|
||||||
|
cursor = self.conn.cursor()
|
||||||
|
cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'")
|
||||||
|
table_exist = bool(cursor.fetchone())
|
||||||
|
self.assertTrue(table_exist)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
190
pyth/vectData.py
190
pyth/vectData.py
@@ -3,12 +3,26 @@ from psycopg2 import sql
|
|||||||
from pgvector.psycopg2 import register_vector
|
from pgvector.psycopg2 import register_vector
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from datetime import datetime ,timedelta
|
||||||
|
|
||||||
host = 'localhost'
|
|
||||||
port = '5432'
|
load_dotenv()
|
||||||
user = 'postgres'
|
|
||||||
password = 'salmonela pljusti 221 hamo'
|
host = os.getenv("DB_HOST")
|
||||||
dbname = 'vector_svw'
|
port = os.getenv("DB_PORT")
|
||||||
|
user = os.getenv("DB_USER")
|
||||||
|
password = os.getenv("DB_PASSWORD")
|
||||||
|
dbname = os.getenv("DB_NAME")
|
||||||
|
|
||||||
|
conn = psycopg2.connect(
|
||||||
|
host=host,
|
||||||
|
port=port,
|
||||||
|
user=user,
|
||||||
|
password=password,
|
||||||
|
dbname=dbname
|
||||||
|
)
|
||||||
|
|
||||||
def calculate_cosine_similarity(v1, v2):
|
def calculate_cosine_similarity(v1, v2):
|
||||||
v1_normalized = v1 / np.linalg.norm(v1)
|
v1_normalized = v1 / np.linalg.norm(v1)
|
||||||
@@ -17,7 +31,7 @@ def calculate_cosine_similarity(v1, v2):
|
|||||||
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
|
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
|
||||||
return similarity
|
return similarity
|
||||||
|
|
||||||
def is_similar_data(title, text, link, embedding, threshold=0.9):
|
def is_similar_data(title, text, link, embedding, threshold=0.98):
|
||||||
conn = psycopg2.connect(
|
conn = psycopg2.connect(
|
||||||
host=host,
|
host=host,
|
||||||
port=port,
|
port=port,
|
||||||
@@ -27,25 +41,33 @@ def is_similar_data(title, text, link, embedding, threshold=0.9):
|
|||||||
)
|
)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
|
cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
|
||||||
existing_embeddings = cursor.fetchall()
|
existing_embeddings = cursor.fetchall()
|
||||||
|
|
||||||
for existing_embedding_tuple in existing_embeddings:
|
for existing_embedding_tuple in existing_embeddings:
|
||||||
existing_title = existing_embedding_tuple[0]
|
existing_title = existing_embedding_tuple[0]
|
||||||
existing_embedding = np.array(existing_embedding_tuple[1]).flatten()
|
existing_embedding = np.array(existing_embedding_tuple[1]).flatten()
|
||||||
|
existing_link = existing_embedding_tuple[2]
|
||||||
similarity = calculate_cosine_similarity(existing_embedding, embedding)
|
similarity = calculate_cosine_similarity(existing_embedding, embedding)
|
||||||
if similarity > threshold:
|
if similarity > threshold:
|
||||||
print(f"Similar data found: \n #{title} \n #{existing_title}")
|
if link != existing_link:
|
||||||
cursor.close()
|
similar_d = existing_title
|
||||||
conn.close()
|
insert_data(title,text,link,embedding,similar_d)
|
||||||
return True
|
print(f"Similar data found: \n #{title} \n #{existing_title}")
|
||||||
|
print(f"Inserting: #{title} \n")
|
||||||
|
similar_d = "NO"
|
||||||
|
cursor.close()
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"Same source of same article!")
|
||||||
|
cursor.close()
|
||||||
|
return True
|
||||||
|
|
||||||
print(f"Inserting: #{title}")
|
print(f"Inserting: #{title}")
|
||||||
cursor.close()
|
cursor.close()
|
||||||
conn.close()
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def insert_data(title, text, link, embedding):
|
def get_similar():
|
||||||
conn = psycopg2.connect(
|
conn = psycopg2.connect(
|
||||||
host=host,
|
host=host,
|
||||||
port=port,
|
port=port,
|
||||||
@@ -53,17 +75,35 @@ def insert_data(title, text, link, embedding):
|
|||||||
password=password,
|
password=password,
|
||||||
dbname=dbname
|
dbname=dbname
|
||||||
)
|
)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
|
||||||
|
cursor.execute(query)
|
||||||
|
similar_data = cursor.fetchall()
|
||||||
|
cursor.close()
|
||||||
|
return similar_data
|
||||||
|
|
||||||
|
|
||||||
|
def insert_data(title, text, link, embedding, similar_d="NO"):
    """Insert one article row into the `vectorsvevijesti` table.

    Args:
        title: Article title.
        text: Article body.
        link: Source URL(s) of the article.
        embedding: Embedding vector stored in the `embedding` column.
        similar_d: Title of the stored article this one resembles, or "NO"
            when it has no similar counterpart. Defaults to "NO" so callers
            that still pass four arguments keep working.

    The row is stamped with the current local time in the `time` column.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    c_time = datetime.now()

    try:
        with conn.cursor() as cursor:
            cursor.execute('''
                INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time)
                VALUES (%s, %s, %s, %s, %s ,%s);
            ''', (title, text, link, embedding, similar_d, c_time))
        conn.commit()
    finally:
        # A new connection is opened per call, so it must always be released.
        conn.close()
|
|
||||||
|
|
||||||
def get_data():
|
def get_data():
|
||||||
conn = psycopg2.connect(
|
conn = psycopg2.connect(
|
||||||
@@ -79,11 +119,110 @@ def get_data():
|
|||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
data = cursor.fetchall()
|
data = cursor.fetchall()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
conn.close()
|
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def create_db():
|
def modify_similar_data(new_value, title):
    """Set the `similar_d` column of every row whose title equals `title`.

    Args:
        new_value: Value to store in `similar_d`.
        title: Title selecting the row(s) to update.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''

    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (new_value, title))
        conn.commit()
    finally:
        # The original never closed the cursor or the connection.
        conn.close()
|
||||||
|
|
||||||
|
def get_specific_data(title):
    """Fetch all rows of `vectorsvevijesti` whose title equals `title`.

    Args:
        title: Title to look up.

    Returns:
        A list of (title, text, link, similar_d, embedding) tuples; empty
        when no row matches.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    query = '''SELECT title, text, link, similar_d, embedding FROM vectorsvevijesti WHERE title = %s'''

    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (title,))
            specific_post = cursor.fetchall()
    finally:
        # Release the per-call connection; the original left it open.
        conn.close()
    return specific_post
|
||||||
|
|
||||||
|
def get_all_links():
    """Return the set of all `link` values stored in `vectorsvevijesti`."""
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    query = '''SELECT link FROM vectorsvevijesti'''

    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            db_links = {row[0] for row in cursor.fetchall()}
    finally:
        # Release the per-call connection; the original left it open.
        conn.close()
    return db_links
|
||||||
|
|
||||||
|
def delete_specific(title):
    """Delete every row of `vectorsvevijesti` whose title equals `title`.

    Args:
        title: Title selecting the row(s) to delete.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''

    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (title,))
        # psycopg2 opens a transaction implicitly; without an explicit commit
        # the DELETE is discarded when the connection goes away.
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
def cleansing():
    """Delete rows of `vectorsvevijesti` whose `time` is older than one day.

    The cutoff is computed as the current local time minus 24 hours.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    day_long = datetime.now() - timedelta(days=1)
    query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''

    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (day_long,))
        conn.commit()
    finally:
        # Release the per-call connection; the original left it open.
        conn.close()
|
||||||
|
|
||||||
|
def drop_table():
    """Drop the `vectorsvevijesti` table if it exists."""
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    query = '''DROP TABLE IF EXISTS vectorsvevijesti;'''

    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
        conn.commit()
    finally:
        # Release the per-call connection; the original left it open.
        conn.close()
|
||||||
|
|
||||||
|
def create_db(conn):
|
||||||
conn = psycopg2.connect(
|
conn = psycopg2.connect(
|
||||||
host=host,
|
host=host,
|
||||||
port=port,
|
port=port,
|
||||||
@@ -97,19 +236,18 @@ def create_db():
|
|||||||
|
|
||||||
register_vector(conn)
|
register_vector(conn)
|
||||||
|
|
||||||
cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
|
|
||||||
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
CREATE TABLE vectorsvevijesti (
|
CREATE TABLE IF NOT EXISTS vectorsvevijesti (
|
||||||
id bigserial PRIMARY KEY,
|
id bigserial PRIMARY KEY,
|
||||||
title VARCHAR,
|
title VARCHAR,
|
||||||
text VARCHAR,
|
text VARCHAR,
|
||||||
link VARCHAR,
|
link VARCHAR,
|
||||||
embedding vector(1536)
|
embedding vector(1536),
|
||||||
|
similar_d VARCHAR,
|
||||||
|
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
);
|
);
|
||||||
''')
|
''')
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
conn.close()
|
create_db(conn)
|
||||||
create_db()
|
|
||||||
|
|||||||
24
pyth/web-server.py
Normal file
24
pyth/web-server.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
from flask import Flask , render_template , jsonify
|
||||||
|
from vectData import get_data
|
||||||
|
from flask_cors import CORS
|
||||||
|
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
CORS(app)
|
||||||
|
|
||||||
|
@app.route('/')
|
||||||
|
def index() :
|
||||||
|
return render_template("index.html")
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/article/one')
|
||||||
|
def articleone():
|
||||||
|
return render_template("one.html")
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/article/two')
|
||||||
|
def articletwo():
|
||||||
|
return render_template("two.html")
|
||||||
|
|
||||||
|
app.run(debug=True)
|
||||||
Reference in New Issue
Block a user