Merge branch 'svevijesti-cs' into 'master'
Combine similar article See merge request kbr4/svevijesti!5
This commit was merged in pull request #5.
This commit is contained in:
7
pyth/.env
Normal file
7
pyth/.env
Normal file
@@ -0,0 +1,7 @@
|
||||
OPENAI_API_KEY = "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"
|
||||
|
||||
DB_HOST =localhost
|
||||
DB_PORT =5432
|
||||
DB_USER =postgres
|
||||
DB_PASSWORD =salmonela pljusti 221 hamo
|
||||
DB_NAME =svevijestiweb
|
||||
21
pyth/.gitlab-ci.yml
Normal file
21
pyth/.gitlab-ci.yml
Normal file
@@ -0,0 +1,21 @@
|
||||
stages:
|
||||
- test
|
||||
|
||||
variables:
|
||||
|
||||
before_script:
|
||||
- pip install -r requirements.txt
|
||||
|
||||
test_file1:
|
||||
stage: test
|
||||
script:
|
||||
- python -m pytest tests/test_scrapingsingle.py
|
||||
only:
|
||||
- master
|
||||
|
||||
test_file2:
|
||||
stage: test
|
||||
script:
|
||||
- python -m pytest tests/test_vectData.py
|
||||
only:
|
||||
- master
|
||||
BIN
pyth/__pycache__/articles.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/articles.cpython-310.pyc
Normal file
Binary file not shown.
BIN
pyth/__pycache__/scrapingsingle.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/scrapingsingle.cpython-310.pyc
Normal file
Binary file not shown.
BIN
pyth/__pycache__/vectData.cpython-310.pyc
Normal file
BIN
pyth/__pycache__/vectData.cpython-310.pyc
Normal file
Binary file not shown.
241
pyth/articles.py
Normal file
241
pyth/articles.py
Normal file
@@ -0,0 +1,241 @@
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from openai import OpenAI
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
|
||||
from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
|
||||
import json
|
||||
from json_repair import repair_json
|
||||
|
||||
load_dotenv()
|
||||
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
||||
client = OpenAI()
|
||||
embeddings = OpenAIEmbeddings()
|
||||
|
||||
print(f"Checking for similar!")
|
||||
|
||||
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Greedily cluster stored articles whose embeddings are nearly identical.

    Each article that is not yet part of a group opens a new group and pulls in
    every other still-unassigned article whose cosine similarity to it is
    strictly greater than ``threshold``.

    Args:
        eps: Accepted for interface compatibility; not read by this function.
        min_samples: Accepted for interface compatibility; not read by this
            function.
        threshold: Similarity an article must exceed to join a group.

    Returns:
        A list of groups, each a list of ``(title, link)`` tuples. Articles
        with no match come back as single-element groups. An empty list is
        returned when a ``psycopg2.Error`` is raised.
    """
    try:
        titles, links, vectors = get_titles_links_embeddings()
        records = list(zip(titles, links, vectors))

        seen = set()
        groups = []

        for anchor_idx, (anchor_title, anchor_link, anchor_vec) in enumerate(records):
            anchor_key = (anchor_title, anchor_link)
            if anchor_key in seen:
                continue

            seen.add(anchor_key)
            members = [anchor_key]

            for other_idx, (other_title, other_link, other_vec) in enumerate(records):
                other_key = (other_title, other_link)
                if other_idx == anchor_idx or other_key in seen:
                    continue

                if calculate_cosine_similarity(anchor_vec, other_vec) > threshold:
                    seen.add(other_key)
                    members.append(other_key)

            groups.append(members)

        return groups

    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []
|
||||
|
||||
def _unique_in_order(values):
    """Return ``values`` as a list with duplicates dropped, keeping first-seen order."""
    return list(dict.fromkeys(values))


def _build_merge_prompt(texts, token_count, token_limit=2000):
    """Build the user prompt asking the model to merge ``texts`` into one story.

    When ``token_count`` exceeds ``token_limit`` the texts are joined and
    truncated with ``slice_text_at_2k_tokens`` before being embedded in the
    prompt; otherwise they are listed in full.
    """
    source_count = len(texts)

    if token_count > token_limit:
        combined_text = slice_text_at_2k_tokens(" ".join(texts))
        if isinstance(combined_text, list):
            # slice_text_at_2k_tokens may hand short input back wrapped in a
            # one-item list; unwrap it so the prompt contains plain text.
            combined_text = combined_text[0]
        return (
            f"Here is text {combined_text}, combined from {source_count} sources, "
            "filter text, and make news content, return as JSON only with a "
            "single 'content' field"
        )

    listed_texts = " ".join(texts[:-1]) + f" and {texts[-1]}"
    return (
        f"Here are {source_count} texts {listed_texts}, combine the following "
        "texts into a cohesive news, remove any non-news related to all texts, "
        "and provide the cleaned data as a JSON only with a single 'content' field."
    )


def _combine_group(titles):
    """Merge the stored articles named by ``titles`` into one combined article.

    Raises:
        LookupError: If any title has no stored row.
        Exception: Whatever the OpenAI client, JSON parsing or database
            helpers raise; the caller decides how to report it.
    """
    rows = [get_specific_data(title) for title in titles]
    missing = [title for title, found in zip(titles, rows) if not found]
    if missing:
        raise LookupError(f"No stored article for: {missing}")

    # get_specific_data rows are (title, text, link, ...); take the first match.
    texts = [found[0][1] for found in rows]
    distinct_links = _unique_in_order(found[0][2] for found in rows)
    if len(distinct_links) == 1:
        link = distinct_links[0]
    else:
        link = ", ".join(str(item) for item in distinct_links)

    token_count = sum(num_tokens_from_string(text) for text in texts)
    similar_d = "C: " + ", ".join(titles)
    user_message = _build_merge_prompt(texts, token_count)

    # Generate the merged article first: if the model call or JSON parsing
    # fails, the source rows are still untouched and stay visible.
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Data analytic, Journalist and News reporter"},
            {"role": "user", "content": user_message},
        ],
    )
    generated_text = repair_json(completion.choices[0].message.content)
    response_data = json.loads(generated_text)
    text = response_data["content"]
    vector = embeddings.embed_query(generated_text)

    # The combined row reuses the first source title and the two helpers below
    # address rows by title, so the sources must be flagged BEFORE the insert;
    # otherwise the freshly inserted row would be flagged as well.
    for title in titles:
        modify_similar_data(similar_d, title)
        preparing_articles(False, title)

    insert_data(titles[0], text, link, vector, similar_d)


def processing_similar():
    """Combine every group of similar articles into a single merged article.

    For each group of two or more articles returned by
    ``find_and_group_similar_articles`` the source texts are merged by the
    chat model, the sources are marked as combined and not ready, and the
    merged article is inserted under the first source's title. Groups of any
    size are handled. A failure in one group is printed and does not stop the
    remaining groups.
    """
    groups = find_and_group_similar_articles()

    if not groups:
        print("No similar articles found.")
        return

    for group in groups:
        titles = [entry[0] for entry in group if len(entry) >= 2]
        if len(titles) < 2:
            # Single-article groups have nothing to merge.
            continue

        try:
            _combine_group(titles)
            print(f"Inserting combined: {titles[0]}")
        except Exception as e:
            # Best-effort batch job: report the failing group and move on.
            print(f"Error: {e}")
            print(titles[0])

    # NOTE(review): the nesting of this message was not recoverable from the
    # flattened source; it is emitted once after all groups were visited.
    print("Done!.")
|
||||
if __name__=="__main__":
|
||||
processing_similar()
|
||||
141
pyth/requirements.txt
Normal file
141
pyth/requirements.txt
Normal file
@@ -0,0 +1,141 @@
|
||||
aiohttp==3.9.1
|
||||
aiosignal==1.3.1
|
||||
annotated-types==0.6.0
|
||||
anyio==4.2.0
|
||||
apturl==0.5.2
|
||||
async-timeout==4.0.3
|
||||
attrs==23.1.0
|
||||
beautifulsoup4==4.12.2
|
||||
blinker==1.7.0
|
||||
blis==0.7.11
|
||||
Brlapi==0.8.3
|
||||
catalogue==2.0.10
|
||||
certifi==2020.6.20
|
||||
chardet==4.0.0
|
||||
charset-normalizer==3.3.2
|
||||
click==8.1.7
|
||||
cloudpathlib==0.16.0
|
||||
colorama==0.4.4
|
||||
command-not-found==0.3
|
||||
confection==0.1.4
|
||||
cryptography==3.4.8
|
||||
cupshelpers==1.0
|
||||
cymem==2.0.8
|
||||
dataclasses-json==0.6.3
|
||||
DateTime==5.4
|
||||
dbus-python==1.2.18
|
||||
decorator==4.4.2
|
||||
defer==1.0.6
|
||||
distro==1.7.0
|
||||
distro-info==1.1+ubuntu0.1
|
||||
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
|
||||
exceptiongroup==1.2.0
|
||||
Flask==3.0.0
|
||||
Flask-Cors==4.0.0
|
||||
frozenlist==1.4.1
|
||||
greenlet==1.1.2
|
||||
gyp==0.1
|
||||
h11==0.14.0
|
||||
httpcore==1.0.2
|
||||
httplib2==0.20.2
|
||||
httpx==0.25.2
|
||||
idna==3.3
|
||||
importlib-metadata==4.6.4
|
||||
itsdangerous==2.1.2
|
||||
jeepney==0.7.1
|
||||
Jinja2==3.1.2
|
||||
joblib==1.3.2
|
||||
jsonpatch==1.33
|
||||
jsonpointer==2.4
|
||||
keyring==23.5.0
|
||||
langchain==0.0.352
|
||||
langchain-community==0.0.6
|
||||
langchain-core==0.1.3
|
||||
langcodes==3.3.0
|
||||
langsmith==0.0.74
|
||||
language-selector==0.1
|
||||
launchpadlib==1.10.16
|
||||
lazr.restfulclient==0.14.4
|
||||
lazr.uri==1.0.6
|
||||
louis==3.20.0
|
||||
macaroonbakery==1.3.1
|
||||
MarkupSafe==2.1.3
|
||||
marshmallow==3.20.1
|
||||
more-itertools==8.10.0
|
||||
multidict==6.0.4
|
||||
murmurhash==1.0.10
|
||||
mypy-extensions==1.0.0
|
||||
netifaces==0.11.0
|
||||
numpy==1.26.2
|
||||
oauthlib==3.2.0
|
||||
olefile==0.46
|
||||
openai==1.5.0
|
||||
packaging==23.2
|
||||
pbr==5.8.0
|
||||
pexpect==4.8.0
|
||||
pgvector==0.2.4
|
||||
Pillow==9.0.1
|
||||
preshed==3.0.9
|
||||
protobuf==3.12.4
|
||||
psycopg==3.1.15
|
||||
psycopg2-binary==2.9.9
|
||||
ptyprocess==0.7.0
|
||||
pycairo==1.20.1
|
||||
pycups==2.0.1
|
||||
pydantic==2.5.2
|
||||
pydantic_core==2.14.5
|
||||
PyGObject==3.42.1
|
||||
PyJWT==2.3.0
|
||||
pymacaroons==0.13.0
|
||||
PyNaCl==1.5.0
|
||||
pyparsing==2.4.7
|
||||
pyRFC3339==1.1
|
||||
python-apt==2.4.0+ubuntu2
|
||||
python-dateutil==2.8.1
|
||||
python-debian==0.1.43+ubuntu1.1
|
||||
python-dotenv==1.0.0
|
||||
pytz==2022.1
|
||||
pyxdg==0.27
|
||||
PyYAML==5.4.1
|
||||
regex==2023.10.3
|
||||
reportlab==3.6.8
|
||||
requests==2.31.0
|
||||
scikit-learn==1.3.2
|
||||
scipy==1.11.4
|
||||
SecretStorage==3.3.1
|
||||
six==1.16.0
|
||||
slugify==0.0.1
|
||||
smart-open==6.4.0
|
||||
sniffio==1.3.0
|
||||
soupsieve==2.5
|
||||
spacy==3.7.2
|
||||
spacy-legacy==3.0.12
|
||||
spacy-loggers==1.0.5
|
||||
SQLAlchemy==1.4.31
|
||||
sqlalchemy-migrate==0.13.0
|
||||
sqlparse==0.4.2
|
||||
srsly==2.4.8
|
||||
systemd-python==234
|
||||
Tempita==0.5.2
|
||||
tenacity==8.2.3
|
||||
thinc==8.2.2
|
||||
threadpoolctl==3.2.0
|
||||
tiktoken==0.5.2
|
||||
tqdm==4.66.1
|
||||
typer==0.9.0
|
||||
typing-inspect==0.9.0
|
||||
typing_extensions==4.9.0
|
||||
ubuntu-advantage-tools==8001
|
||||
ubuntu-drivers-common==0.0.0
|
||||
ufw==0.36.1
|
||||
unattended-upgrades==0.1
|
||||
urllib3==1.26.5
|
||||
wadllib==1.3.6
|
||||
wasabi==1.1.2
|
||||
weasel==0.3.4
|
||||
Werkzeug==3.0.1
|
||||
xdg==5
|
||||
xkit==0.0.0
|
||||
yarl==1.9.4
|
||||
zipp==1.0.0
|
||||
zope.interface==6.1
|
||||
@@ -1,21 +1,66 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from urllib.parse import urljoin
|
||||
from openai import OpenAI
|
||||
from openai import OpenAI
|
||||
import os
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from langchain.vectorstores.pgvector import PGVector
|
||||
from vectData import insert_data ,is_similar_data
|
||||
from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
import tiktoken
|
||||
from json_repair import repair_json
|
||||
|
||||
load_dotenv()
|
||||
cleansing()
|
||||
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
||||
|
||||
os.environ["OPENAI_API_KEY"] = "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"
|
||||
client = OpenAI()
|
||||
embeddings = OpenAIEmbeddings()
|
||||
|
||||
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
|
||||
|
||||
def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Count the tokens ``string`` encodes to under ``model``'s tiktoken encoding."""
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)
|
||||
|
||||
def slice_text_at_2k_tokens(text, max_tokens=1950):
    """Truncate ``text`` to at most ``max_tokens`` gpt-3.5-turbo tokens.

    Args:
        text: The string to shorten.
        max_tokens: Largest number of tokens to keep (default 1950).

    Returns:
        ``text`` unchanged when it already fits, otherwise the decoded first
        ``max_tokens`` tokens. The result is always a ``str``; previously a
        short input came back wrapped in a one-element list, which leaked
        list syntax into anything that stringified the result.
    """
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])
|
||||
|
||||
def slice_title_if_needed(text, max_tokens=100):
    """Truncate a title to at most ``max_tokens`` gpt-3.5-turbo tokens.

    Args:
        text: The title to shorten.
        max_tokens: Largest number of tokens to keep (default 100).

    Returns:
        ``text`` unchanged when it already fits, otherwise the decoded first
        ``max_tokens`` tokens. The result is always a ``str``; previously a
        short title came back wrapped in a one-element list.
    """
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])
|
||||
|
||||
def replace_with_spaces(text):
    """Replace every character outside the allowed alphabet with a space.

    Allowed characters are ASCII letters, digits, the space character and the
    Bosnian/Croatian/Serbian Latin letters with diacritics. The previous
    allowed-character literal was mis-encoded, so đ, Š, š and Ž were treated
    as disallowed and blanked out of titles and texts.

    Args:
        text: The string to clean.

    Returns:
        A string of the same length with each disallowed character replaced
        by a single space.
    """
    allowed_chars = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "ČčĆćĐđŠšŽž"
        "0123456789 "
    )
    return ''.join(char if char in allowed_chars else ' ' for char in text)
|
||||
|
||||
def fix_links(links_set):
    """Return a new set of the given links with every ``www.`` removed.

    Links that do not contain ``www`` are carried over unchanged; links that
    become equal after the removal collapse into one entry.
    """
    return {
        url.replace("www.", "") if "www" in url else url
        for url in links_set
    }
|
||||
|
||||
total_links = set()
|
||||
collected_news = set()
|
||||
@@ -42,10 +87,17 @@ for dlink in dlinks:
|
||||
temp_links = get_article_links(dlink, already_checked)
|
||||
if temp_links:
|
||||
total_links.update(temp_links)
|
||||
|
||||
final_links = {item for item in total_links if item}
|
||||
|
||||
for link in final_links:
|
||||
db_links = set(get_all_links())
|
||||
new_links = final_links - db_links
|
||||
final_links = new_links
|
||||
final_links = set(final_links)
|
||||
final_links = fix_links(final_links)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
for link in final_links:
|
||||
response = requests.get(link,headers)
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
@@ -54,34 +106,41 @@ for link in final_links:
|
||||
|
||||
texts = soup.find_all(['p'])
|
||||
text_text = ' '.join([text.get_text(strip=True) for text in texts])
|
||||
|
||||
text_text = text_text
|
||||
title_text = title_text
|
||||
|
||||
title_text = replace_with_spaces(title_text)
|
||||
|
||||
text_text = slice_text_at_2k_tokens(text_text)
|
||||
text_text = replace_with_spaces(str(text_text))
|
||||
|
||||
ttk = num_tokens_from_string(text_text)
|
||||
|
||||
if ttk > 1900:
|
||||
title_text = slice_title_if_needed(title_text)
|
||||
try:
|
||||
completion = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "system", "content": "Data analytic, Journalist and News reporter"},
|
||||
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
|
||||
{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."}
|
||||
]
|
||||
)
|
||||
generated_text = completion.choices[0].message.content
|
||||
|
||||
generated_text = repair_json(generated_text)
|
||||
|
||||
response_data = json.loads(generated_text)
|
||||
|
||||
title = response_data["title"]
|
||||
text = response_data["content"]
|
||||
|
||||
print("*********************************")
|
||||
print(f"Title: {title}")
|
||||
print("---------------------------------")
|
||||
print(f"Content : {text}")
|
||||
print("*********************************")
|
||||
|
||||
|
||||
vector = embeddings.embed_query(generated_text)
|
||||
|
||||
if not is_similar_data(title, text, link, vector, threshold=0.9):
|
||||
insert_data(title, text, link, vector)
|
||||
|
||||
if not is_similar_data(title, text, link, vector, threshold=0.98):
|
||||
similar_d = "NO"
|
||||
insert_data(title, text, link, vector,similar_d)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in completion: {e}")
|
||||
continue
|
||||
|
||||
|
||||
22
pyth/templates/index.html
Normal file
22
pyth/templates/index.html
Normal file
@@ -0,0 +1,22 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Test Pyth</title>
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<article>
|
||||
<h2>Test Title 1</h2>
|
||||
<p>Test Text 1</p>
|
||||
<a href="/article/one"> First</a>
|
||||
</article>
|
||||
<article>
|
||||
<h2>Test Title 2</h2>
|
||||
<p>Test Text 2</p>
|
||||
<a href="/article/two">Second</a>
|
||||
</article>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
12
pyth/templates/one.html
Normal file
12
pyth/templates/one.html
Normal file
@@ -0,0 +1,12 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Article</title>
|
||||
</head>
|
||||
<body>
|
||||
<h2>Test Title</h2>
|
||||
<p>Test Text</p>
|
||||
</body>
|
||||
</html>
|
||||
12
pyth/templates/two.html
Normal file
12
pyth/templates/two.html
Normal file
@@ -0,0 +1,12 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Article</title>
|
||||
</head>
|
||||
<body>
|
||||
<h2>Test Title</h2>
|
||||
<p>Test Text</p>
|
||||
</body>
|
||||
</html>
|
||||
BIN
pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc
Normal file
BIN
pyth/tests/__pycache__/test_scrapingsingle.cpython-310.pyc
Normal file
Binary file not shown.
BIN
pyth/tests/__pycache__/test_vectData.cpython-310.pyc
Normal file
BIN
pyth/tests/__pycache__/test_vectData.cpython-310.pyc
Normal file
Binary file not shown.
60
pyth/tests/test_scrapingsingle.py
Normal file
60
pyth/tests/test_scrapingsingle.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from langchain.vectorstores.pgvector import PGVector
|
||||
from openai import OpenAI
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
from scrapingsingle import get_article_links, insert_data, is_similar_data
|
||||
import os
|
||||
|
||||
load_dotenv()
|
||||
|
||||
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
||||
client = OpenAI()
|
||||
embeddings = OpenAIEmbeddings()
|
||||
|
||||
|
||||
already_checked = set()
|
||||
total_links = set()
|
||||
collected_news = set()
|
||||
dlinks = 'http://127.0.0.1:5000/'
|
||||
|
||||
class TestIntegration(unittest.TestCase):
    """End-to-end check of link discovery and article extraction.

    Runs against the local test site at ``dlinks`` and calls the real OpenAI
    chat and embedding endpoints.
    """

    def test_integration(self):
        """Discover the two test articles and verify the extracted title and text."""
        found_links = get_article_links(dlinks, already_checked)
        self.assertEqual(len(already_checked), 2)

        # Use the links the scraper returned. The previous version looped over
        # a module-level set that nothing filled, so the body never ran and
        # the final length assertion could not pass.
        # NOTE(review): assumes get_article_links returns an iterable of
        # article URLs (or a falsy value), as the scraper script uses it.
        article_links = set(found_links or ())
        # Checked before the loop so a discovery failure costs no API calls.
        self.assertEqual(len(article_links), 2)

        for link in article_links:
            response = requests.get(link)
            soup = BeautifulSoup(response.text, 'html.parser')

            titles = soup.find_all(['h2', 'h1', 'h3'])
            title_text = ' '.join([title.get_text(strip=True) for title in titles])

            texts = soup.find_all(['p'])
            text_text = ' '.join([text.get_text(strip=True) for text in texts])

            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
                ]
            )
            generated_text = completion.choices[0].message.content

            response_data = json.loads(generated_text)
            title = response_data["title"]
            text = response_data["content"]

            # Kept from the original flow: confirms the embedding call succeeds.
            vector = embeddings.embed_query(generated_text)

            self.assertIn("Test Title", title)
            self.assertIn("Test Text", text)
|
||||
|
||||
89
pyth/tests/test_vectData.py
Normal file
89
pyth/tests/test_vectData.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import unittest
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
import os
|
||||
from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
|
||||
|
||||
class TestIntegration(unittest.TestCase):
    """Integration tests for vectData against the database named by the DB_* variables."""

    # Connection settings are read once from the environment.
    host = os.getenv("DB_HOST")
    port = os.getenv("DB_PORT")
    user = os.getenv("DB_USER")
    password = os.getenv("DB_PASSWORD")
    dbname = os.getenv("DB_NAME")

    @classmethod
    def _connect(cls):
        """Open a new psycopg2 connection using the class-level settings."""
        return psycopg2.connect(
            host=cls.host,
            port=cls.port,
            user=cls.user,
            password=cls.password,
            dbname=cls.dbname
        )

    @classmethod
    def setUpClass(cls):
        """Open the shared connection and make sure the table exists."""
        cls.conn = cls._connect()
        # create_db takes no arguments; passing the connection raised TypeError.
        create_db()

    @classmethod
    def tearDownClass(cls):
        """Close the shared connection."""
        cls.conn.close()

    def setUp(self):
        """Reconnect if a previous test closed the connection, then open a cursor."""
        if self.conn.closed:
            self.conn = self._connect()
        self.cursor = self.conn.cursor()

    def tearDown(self):
        """Close the per-test cursor and connection if they are still open."""
        if not self.cursor.closed:
            self.cursor.close()

        if not self.conn.closed:
            self.conn.close()

    def test_insert_and_retrieve_data(self):
        """A row written with insert_data is returned by get_data."""
        title = 'test_title'
        text = 'test_text'
        link = 'test_link'
        embedding = np.arange(1, 1537)

        # insert_data requires the similar_d marker as its fifth argument.
        insert_data(title, text, link, embedding, "NO")

        data = get_data()

        # The table can hold other rows, so check membership rather than
        # equality with the whole result.
        self.assertIn((title, text, link), data)

    def test_is_similar_data_integration(self):
        """An embedding identical to a stored one is reported as similar."""
        title = 'test_title'
        text = 'test_text'
        link = 'test_link'
        embedding = np.arange(1, 1537)

        insert_data(title, text, link, embedding, "NO")

        result = is_similar_data(title, text, link, embedding)
        self.assertTrue(result)

    def test_create_db_integration(self):
        """The vectorsvevijesti table exists after create_db has run."""
        cursor = self.conn.cursor()
        cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'")
        table_exist = bool(cursor.fetchone())
        self.assertTrue(table_exist)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
208
pyth/vectData.py
208
pyth/vectData.py
@@ -3,113 +3,193 @@ from psycopg2 import sql
|
||||
from pgvector.psycopg2 import register_vector
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
import numpy as np
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from datetime import datetime ,timedelta
|
||||
|
||||
host = 'localhost'
|
||||
port = '5432'
|
||||
user = 'postgres'
|
||||
password = 'salmonela pljusti 221 hamo'
|
||||
dbname = 'vector_svw'
|
||||
load_dotenv()
|
||||
|
||||
def calculate_cosine_similarity(v1, v2):
|
||||
v1_normalized = v1 / np.linalg.norm(v1)
|
||||
v2_normalized = v2 / np.linalg.norm(v2)
|
||||
host = os.getenv("DB_HOST")
|
||||
port = os.getenv("DB_PORT")
|
||||
user = os.getenv("DB_USER")
|
||||
password = os.getenv("DB_PASSWORD")
|
||||
dbname = os.getenv("DB_NAME")
|
||||
|
||||
similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
|
||||
return similarity
|
||||
|
||||
def is_similar_data(title, text, link, embedding, threshold=0.9):
|
||||
conn = psycopg2.connect(
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
password=password,
|
||||
dbname=dbname
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
|
||||
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a float.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, with the same number of elements as ``v1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm. The previous version
        divided by the norm unconditionally, which produced NaN values for a
        zero vector.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector has no direction, so treat it as similar to nothing.
        return 0.0

    return float(np.dot(a, b) / denominator)
|
||||
|
||||
def parse_embedding_string(embedding_str):
    """Convert a stored embedding into a NumPy array.

    Args:
        embedding_str: Either a NumPy array, returned as-is, or a string of
            comma-separated numbers wrapped in one leading and one trailing
            character (for example ``"[1,2,3]"``).

    Returns:
        A NumPy array of floats.

    Raises:
        ValueError: If the argument is neither ``str`` nor ``np.ndarray``.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str

    if not isinstance(embedding_str, str):
        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")

    # Drop the surrounding brackets, then parse each comma-separated field.
    inner = embedding_str[1:-1]
    return np.array(list(map(float, inner.split(','))))
|
||||
|
||||
def is_similar_data(title, text, link, embedding, threshold=0.98):
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
|
||||
existing_embeddings = cursor.fetchall()
|
||||
|
||||
for existing_embedding_tuple in existing_embeddings:
|
||||
existing_title = existing_embedding_tuple[0]
|
||||
existing_embedding = np.array(existing_embedding_tuple[1]).flatten()
|
||||
existing_link = existing_embedding_tuple[2]
|
||||
similarity = calculate_cosine_similarity(existing_embedding, embedding)
|
||||
if similarity > threshold:
|
||||
print(f"Similar data found: \n #{title} \n #{existing_title}")
|
||||
cursor.close()
|
||||
conn.close()
|
||||
return True
|
||||
if link != existing_link:
|
||||
similar_d = existing_title
|
||||
insert_data(title,text,link,embedding,similar_d)
|
||||
print(f"Similar data found: \n #{title} \n #{existing_title}")
|
||||
print(f"Inserting: #{title}")
|
||||
similar_d = "NO"
|
||||
cursor.close()
|
||||
return True
|
||||
else:
|
||||
print(f"Same article of same source!")
|
||||
cursor.close()
|
||||
return True
|
||||
|
||||
print(f"Inserting: #{title}")
|
||||
cursor.close()
|
||||
conn.close()
|
||||
return False
|
||||
|
||||
def insert_data(title, text, link, embedding):
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
password=password,
|
||||
dbname=dbname
|
||||
)
|
||||
def get_similar():
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('''
|
||||
INSERT INTO vectorsvevijesti (title, text, link, embedding)
|
||||
VALUES (%s, %s, %s, %s);
|
||||
''', (title, text, link, embedding))
|
||||
|
||||
conn.commit()
|
||||
|
||||
query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
|
||||
cursor.execute(query)
|
||||
similar_data = cursor.fetchall()
|
||||
cursor.close()
|
||||
return similar_data
|
||||
|
||||
def get_titles_links_embeddings():
    """Fetch title, link and embedding of every row whose ``ready`` flag is true.

    Returns:
        Three parallel lists ``(titles, links, embeddings)``; every embedding
        is passed through ``parse_embedding_string``.
    """
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
        data = cursor.fetchall()

    titles = [row[0] for row in data]
    links = [row[1] for row in data]
    embeddings = [parse_embedding_string(row[2]) for row in data]

    return titles, links, embeddings
|
||||
|
||||
|
||||
def insert_data(title, text, link, embedding, similar_d):
|
||||
c_time = datetime.now()
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
|
||||
VALUES (%s, %s, %s, %s, %s ,%s ,%s);
|
||||
''', (title, text, link, embedding , similar_d, c_time, True))
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def get_data():
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
password=password,
|
||||
dbname=dbname
|
||||
)
|
||||
|
||||
cursor = conn.cursor()
|
||||
query = '''SELECT title,text,link FROM vectorsvevijesti;'''
|
||||
|
||||
cursor.execute(query)
|
||||
data = cursor.fetchall()
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
return data
|
||||
|
||||
def create_db():
|
||||
conn = psycopg2.connect(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
password=password,
|
||||
dbname=dbname
|
||||
)
|
||||
def get_ready_data():
    """Return ``(title, text, link, ready)`` rows whose ``ready`` flag is true."""
    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query, (True,))
        return cursor.fetchall()
|
||||
|
||||
def get_source_data():
    """Return ``(title, text, link, ready)`` rows whose ``ready`` flag is false."""
    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query, (False,))
        return cursor.fetchall()
|
||||
|
||||
|
||||
def modify_similar_data(new_value, title):
    """Set ``similar_d`` to ``new_value`` on every row with the given title and commit."""
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
    # The cursor was previously never closed; the context manager releases it.
    with conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
    conn.commit()
|
||||
|
||||
|
||||
def preparing_articles(new_value, title):
    """Set the ``ready`` flag to ``new_value`` on every row with the given title and commit."""
    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
    # The cursor was previously never closed; the context manager releases it.
    with conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
    conn.commit()
|
||||
|
||||
def get_specific_data(title):
    """Return all rows with the given title.

    Returns:
        A list of ``(title, text, link, similar_d, embedding, ready)`` tuples;
        empty when no row matches.
    """
    query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query, (title,))
        return cursor.fetchall()
|
||||
|
||||
|
||||
def get_all_links():
    """Return the set of all ``link`` values stored in the table."""
    query = '''SELECT link FROM vectorsvevijesti'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query)
        return {row[0] for row in cursor.fetchall()}
|
||||
|
||||
def delete_specific(title):
    """Delete every row with the given title and commit the change."""
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
    with conn.cursor() as cursor:
        cursor.execute(query, (title,))
    # The delete was previously left uncommitted, unlike the other writers in
    # this module, so it only took effect if a later call happened to commit.
    conn.commit()
|
||||
|
||||
def cleansing():
    """Delete rows whose ``time`` is more than one day in the past and commit."""
    day_long = datetime.now() - timedelta(days=1)
    query = '''DELETE FROM vectorsvevijesti WHERE time < %s'''
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cursor:
        cursor.execute(query, (day_long,))
    conn.commit()
|
||||
|
||||
def drop_table():
    """Drop the ``vectorsvevijesti`` table if it exists and commit."""
    query = '''DROP TABLE IF EXISTS vectorsvevijesti;'''
    # The context manager closes the cursor even when the statement raises.
    with conn.cursor() as cursor:
        cursor.execute(query)
    conn.commit()
|
||||
|
||||
def create_db():
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
|
||||
|
||||
register_vector(conn)
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
|
||||
|
||||
cursor.execute('''
|
||||
CREATE TABLE vectorsvevijesti (
|
||||
CREATE TABLE IF NOT EXISTS vectorsvevijesti (
|
||||
id bigserial PRIMARY KEY,
|
||||
title VARCHAR,
|
||||
text VARCHAR,
|
||||
link VARCHAR,
|
||||
embedding vector(1536)
|
||||
embedding vector(1536),
|
||||
similar_d VARCHAR,
|
||||
time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
ready BOOLEAN
|
||||
);
|
||||
''')
|
||||
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
conn.close()
|
||||
create_db()
|
||||
create_db()
|
||||
|
||||
29
pyth/web-server.py
Normal file
29
pyth/web-server.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from flask import Flask , render_template , jsonify
|
||||
from vectData import get_ready_data
|
||||
from flask_cors import CORS
|
||||
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
CORS(app)
|
||||
|
||||
@app.route('/')
def index():
    """Serve the test landing page that links to the two sample articles."""
    return render_template("index.html")


@app.route('/article/one')
def articleone():
    """Serve the first sample article page."""
    return render_template("one.html")


@app.route('/article/two')
def articletwo():
    """Serve the second sample article page."""
    return render_template("two.html")


@app.route('/data/get/news', methods=['GET'])
def takenews():
    """Return the rows from ``get_ready_data`` as a JSON response."""
    data = get_ready_data()
    return jsonify(data)


if __name__ == "__main__":
    # Start the development server only when executed as a script, so loading
    # this module (e.g. from a WSGI server or a test) does not block on it.
    # NOTE(review): debug=True enables the interactive debugger; keep it to
    # local development.
    app.run(debug=True)
|
||||
Reference in New Issue
Block a user