Merge branch 'svevijesti-cs' into 'master'

Combine similar articles. See merge request kbr4/svevijesti!5
2024-01-08 09:45:03 +00:00
parent fff1c94a3d 54a41046ce
commit 30d8ca73da
17 changed files with 856 additions and 83 deletions
--- a/pyth/.env
+++ b/pyth/.env
@@ -0,0 +1,7 @@
+OPENAI_API_KEY = "<REDACTED>"
+
+DB_HOST =localhost
+DB_PORT =5432
+DB_USER =postgres
+DB_PASSWORD =<REDACTED>
+DB_NAME =svevijestiweb
--- a/pyth/.gitlab-ci.yml
+++ b/pyth/.gitlab-ci.yml
@@ -0,0 +1,21 @@
+stages:
+  - test
+
+variables:
+
+before_script:
+  - pip install -r requirements.txt 
+
+test_file1:
+  stage: test
+  script:
+    - python -m pytest tests/test_scrapingsingle.py
+  only:
+    - master
+
+test_file2:
+  stage: test
+  script:
+    - python -m pytest tests/test_vectData.py
+  only:
+    - master
--- a/pyth/pycache/articles.cpython-310.pyc
+++ b/pyth/pycache/articles.cpython-310.pyc
--- a/pyth/pycache/scrapingsingle.cpython-310.pyc
+++ b/pyth/pycache/scrapingsingle.cpython-310.pyc
--- a/pyth/pycache/vectData.cpython-310.pyc
+++ b/pyth/pycache/vectData.cpython-310.pyc
--- a/pyth/articles.py
+++ b/pyth/articles.py
@@ -0,0 +1,241 @@
+import psycopg2
+import numpy as np
+from dotenv import load_dotenv
+import os
+from openai import OpenAI
+from langchain.embeddings import OpenAIEmbeddings
+from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
+from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
+import json
+from json_repair import repair_json
+
+load_dotenv()
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+client = OpenAI()
+embeddings = OpenAIEmbeddings()
+
+print(f"Checking for similar!")
+
+def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
+    """Greedily cluster the stored articles by embedding similarity.
+
+    Every article not yet assigned seeds a new group; each remaining
+    unassigned article whose cosine similarity to the seed is above
+    ``threshold`` joins that group. Articles are identified by their
+    ``(title, link)`` pair.
+
+    Args:
+        eps: Not used by the body.
+        min_samples: Not used by the body.
+        threshold: Similarity that must be exceeded to join a group.
+
+    Returns:
+        A list of groups, each a list of ``(title, link)`` tuples; groups of
+        one are included. An empty list is returned after a ``psycopg2.Error``.
+    """
+    try:
+        titles, links, vectors = get_titles_links_embeddings()
+        records = list(zip(titles, links, vectors))
+
+        assigned = set()
+        clusters = []
+
+        for seed_pos, (seed_title, seed_link, seed_vector) in enumerate(records):
+            seed_key = (seed_title, seed_link)
+            if seed_key in assigned:
+                continue
+
+            assigned.add(seed_key)
+            cluster = [seed_key]
+
+            for other_pos, (other_title, other_link, other_vector) in enumerate(records):
+                other_key = (other_title, other_link)
+                if other_pos == seed_pos or other_key in assigned:
+                    continue
+
+                if calculate_cosine_similarity(seed_vector, other_vector) > threshold:
+                    assigned.add(other_key)
+                    cluster.append(other_key)
+
+            clusters.append(cluster)
+
+        return clusters
+
+    except psycopg2.Error as e:
+        print(f"Error: {e}")
+        return []
+    
+def _unique_links(links):
+    """Return the distinct source links joined by ", ", in first-seen order."""
+    return ", ".join(dict.fromkeys(str(link) for link in links))
+
+
+def _build_merge_prompt(texts, token_limit=2000):
+    """Build the user prompt asking the model to merge ``texts`` into one story.
+
+    When the summed token count of ``texts`` exceeds ``token_limit`` the texts
+    are joined and truncated with ``slice_text_at_2k_tokens`` before being
+    placed in the prompt; otherwise they are passed in full.
+    """
+    total_tokens = sum(num_tokens_from_string(text) for text in texts)
+    if total_tokens > token_limit:
+        combined_text = slice_text_at_2k_tokens(" ".join(texts))
+        # slice_text_at_2k_tokens may return a one-element list instead of a string.
+        if isinstance(combined_text, list):
+            combined_text = combined_text[0]
+        return (
+            f"Here is text {combined_text}, combined from {len(texts)} sources, "
+            "filter text, and make news content, return as JSON only with a "
+            "single 'content' field"
+        )
+    return (
+        f"Here are {len(texts)} texts {' '.join(texts)}, combine the following "
+        "texts into a cohesive news, remove any non-news related to all texts, "
+        "and provide the cleaned data as a JSON only with a single 'content' field."
+    )
+
+
+def _merge_group(titles):
+    """Combine the articles named by ``titles`` into one stored article.
+
+    The source rows are tagged with a shared ``similar_d`` marker and set to
+    not-ready, then the model is asked for a merged text, which is inserted
+    under the first title together with the de-duplicated source links.
+    """
+    rows = []
+    for title in titles:
+        matches = get_specific_data(title)
+        if not matches:
+            # Nothing to merge with if a source row is missing; leave the group as is.
+            print(f"Error: article not found: {title}")
+            return
+        rows.append(matches[0])
+
+    # Row layout from get_specific_data: index 1 is the text, index 2 the link.
+    texts = [row[1] for row in rows]
+    link = _unique_links(row[2] for row in rows)
+    similar_d = "C: " + ", ".join(titles)
+    user_message = _build_merge_prompt(texts)
+
+    # The sources must be flagged BEFORE the combined row is inserted: both
+    # updates match on title, and the combined row reuses the first title.
+    for title in titles:
+        modify_similar_data(similar_d, title)
+        preparing_articles(False, title)
+
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
+                {"role": "user", "content": user_message}
+            ])
+        generated_text = repair_json(completion.choices[0].message.content)
+
+        response_data = json.loads(generated_text)
+        text = response_data["content"]
+        vector = embeddings.embed_query(generated_text)
+
+        insert_data(titles[0], text, link, vector, similar_d)
+        print(f"Inserting combined: {titles[0]}")
+
+    except Exception as e:
+        # Best effort: report the failing group and carry on with the next one.
+        print(f"Error: {e}")
+        print(titles[0])
+
+
+def processing_similar():
+    """Merge every group of two or more similar articles into a combined article.
+
+    Groups come from ``find_and_group_similar_articles``; single-article groups
+    are skipped. Groups of any size are handled. Prints "Done!." after the
+    groups were processed, or "No similar articles found." when there were none.
+    """
+    grouped_similar_articles_result = find_and_group_similar_articles()
+
+    if not grouped_similar_articles_result:
+        print("No similar articles found.")
+        return
+
+    for group in grouped_similar_articles_result:
+        if len(group) > 1:
+            _merge_group([article_tuple[0] for article_tuple in group])
+
+    print("Done!.")
+if __name__=="__main__":
+    processing_similar()
--- a/pyth/requirements.txt
+++ b/pyth/requirements.txt
@@ -0,0 +1,141 @@
+aiohttp==3.9.1
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.2.0
+apturl==0.5.2
+async-timeout==4.0.3
+attrs==23.1.0
+beautifulsoup4==4.12.2
+blinker==1.7.0
+blis==0.7.11
+Brlapi==0.8.3
+catalogue==2.0.10
+certifi==2020.6.20
+chardet==4.0.0
+charset-normalizer==3.3.2
+click==8.1.7
+cloudpathlib==0.16.0
+colorama==0.4.4
+command-not-found==0.3
+confection==0.1.4
+cryptography==3.4.8
+cupshelpers==1.0
+cymem==2.0.8
+dataclasses-json==0.6.3
+DateTime==5.4
+dbus-python==1.2.18
+decorator==4.4.2
+defer==1.0.6
+distro==1.7.0
+distro-info==1.1+ubuntu0.1
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
+exceptiongroup==1.2.0
+Flask==3.0.0
+Flask-Cors==4.0.0
+frozenlist==1.4.1
+greenlet==1.1.2
+gyp==0.1
+h11==0.14.0
+httpcore==1.0.2
+httplib2==0.20.2
+httpx==0.25.2
+idna==3.3
+importlib-metadata==4.6.4
+itsdangerous==2.1.2
+jeepney==0.7.1
+Jinja2==3.1.2
+joblib==1.3.2
+jsonpatch==1.33
+jsonpointer==2.4
+keyring==23.5.0
+langchain==0.0.352
+langchain-community==0.0.6
+langchain-core==0.1.3
+langcodes==3.3.0
+langsmith==0.0.74
+language-selector==0.1
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+louis==3.20.0
+macaroonbakery==1.3.1
+MarkupSafe==2.1.3
+marshmallow==3.20.1
+more-itertools==8.10.0
+multidict==6.0.4
+murmurhash==1.0.10
+mypy-extensions==1.0.0
+netifaces==0.11.0
+numpy==1.26.2
+oauthlib==3.2.0
+olefile==0.46
+openai==1.5.0
+packaging==23.2
+pbr==5.8.0
+pexpect==4.8.0
+pgvector==0.2.4
+Pillow==9.0.1
+preshed==3.0.9
+protobuf==3.12.4
+psycopg==3.1.15
+psycopg2-binary==2.9.9
+ptyprocess==0.7.0
+pycairo==1.20.1
+pycups==2.0.1
+pydantic==2.5.2
+pydantic_core==2.14.5
+PyGObject==3.42.1
+PyJWT==2.3.0
+pymacaroons==0.13.0
+PyNaCl==1.5.0
+pyparsing==2.4.7
+pyRFC3339==1.1
+python-apt==2.4.0+ubuntu2
+python-dateutil==2.8.1
+python-debian==0.1.43+ubuntu1.1
+python-dotenv==1.0.0
+pytz==2022.1
+pyxdg==0.27
+PyYAML==5.4.1
+regex==2023.10.3
+reportlab==3.6.8
+requests==2.31.0
+scikit-learn==1.3.2
+scipy==1.11.4
+SecretStorage==3.3.1
+six==1.16.0
+slugify==0.0.1
+smart-open==6.4.0
+sniffio==1.3.0
+soupsieve==2.5
+spacy==3.7.2
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+SQLAlchemy==1.4.31
+sqlalchemy-migrate==0.13.0
+sqlparse==0.4.2
+srsly==2.4.8
+systemd-python==234
+Tempita==0.5.2
+tenacity==8.2.3
+thinc==8.2.2
+threadpoolctl==3.2.0
+tiktoken==0.5.2
+tqdm==4.66.1
+typer==0.9.0
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+ubuntu-advantage-tools==8001
+ubuntu-drivers-common==0.0.0
+ufw==0.36.1
+unattended-upgrades==0.1
+urllib3==1.26.5
+wadllib==1.3.6
+wasabi==1.1.2
+weasel==0.3.4
+Werkzeug==3.0.1
+xdg==5
+xkit==0.0.0
+yarl==1.9.4
+zipp==1.0.0
+zope.interface==6.1
--- a/pyth/scrapingsingle.py
+++ b/pyth/scrapingsingle.py
@@ -1,21 +1,66 @@
 from bs4 import BeautifulSoup
 import requests
 from urllib.parse import urljoin
-from openai import OpenAI
+from openai import OpenAI 
 import os
 from langchain.embeddings import OpenAIEmbeddings
-from langchain.vectorstores.pgvector import PGVector
-from vectData import insert_data ,is_similar_data 
+from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
 import json
+from dotenv import load_dotenv
+import tiktoken
+from json_repair import repair_json

+load_dotenv()
+cleansing()
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

-os.environ["OPENAI_API_KEY"] = "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"
 client = OpenAI()
 embeddings = OpenAIEmbeddings()

 dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
 headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

+def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
+    """Count the tokens ``string`` encodes to under the tiktoken encoding of ``model``."""
+    token_ids = tiktoken.encoding_for_model(model).encode(string)
+    return len(token_ids)
+
+def slice_text_at_2k_tokens(text, max_tokens=1950, model="gpt-3.5-turbo"):
+    """Truncate ``text`` to at most ``max_tokens`` tokens.
+
+    Args:
+        text: The string to shorten.
+        max_tokens: Token budget; text within the budget is returned unchanged.
+        model: Model name used to select the tiktoken encoding.
+
+    Returns:
+        Always a ``str``: ``text`` itself when it fits, otherwise the decoded
+        first ``max_tokens`` tokens.
+    """
+    encoding = tiktoken.encoding_for_model(model)
+    tokens = encoding.encode(text)
+    if len(tokens) <= max_tokens:
+        # Return the string itself (not a list) so both paths have one type;
+        # callers that wrap the result in str() are unaffected.
+        return text
+    return encoding.decode(tokens[:max_tokens])
+
+def slice_title_if_needed(text, max_tokens=100, model="gpt-3.5-turbo"):
+    """Truncate a title to at most ``max_tokens`` tokens.
+
+    Args:
+        text: The title string to shorten.
+        max_tokens: Token budget; a title within the budget is returned unchanged.
+        model: Model name used to select the tiktoken encoding.
+
+    Returns:
+        Always a ``str``: ``text`` itself when it fits, otherwise the decoded
+        first ``max_tokens`` tokens.
+    """
+    encoding = tiktoken.encoding_for_model(model)
+    tokens = encoding.encode(text)
+    if len(tokens) <= max_tokens:
+        # Return the string itself (not a list) so the prompt never contains
+        # the repr of a Python list.
+        return text
+    return encoding.decode(tokens[:max_tokens])
+
+def replace_with_spaces(text):
+    """Replace every character outside the allowed set with a single space.
+
+    ASCII letters, the listed diacritic letters, digits and the space itself
+    are kept; the result has the same length as ``text``.
+    """
+    allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐđŠšŽž0123456789 "
+    pieces = []
+    for char in text:
+        if char in allowed_chars:
+            pieces.append(char)
+        else:
+            pieces.append(' ')
+    return ''.join(pieces)
+
+def fix_links(links_set):
+    """Return a new set with every "www." occurrence removed from each link.
+
+    Links without "www." are carried over unchanged.
+    """
+    # str.replace is a no-op when "www." is absent, so no membership test is needed.
+    return {link.replace("www.", "") for link in links_set}

 total_links = set()
 collected_news = set()
@@ -42,10 +87,17 @@ for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if temp_links:
        total_links.update(temp_links)
-
 final_links = {item for item in total_links if item}

-for link in final_links:
+db_links = set(get_all_links())
+new_links = final_links - db_links
+final_links = new_links
+final_links = set(final_links)
+final_links = fix_links(final_links)
+
+if __name__ == '__main__':
+ 
+ for link in final_links:
    response = requests.get(link,headers)
    soup = BeautifulSoup(response.text, 'html.parser')

@@ -54,34 +106,41 @@ for link in final_links:

    texts = soup.find_all(['p'])
    text_text = ' '.join([text.get_text(strip=True) for text in texts])
+
+    text_text = text_text
+    title_text = title_text
    
+    title_text = replace_with_spaces(title_text)
+
+    text_text = slice_text_at_2k_tokens(text_text)
+    text_text = replace_with_spaces(str(text_text))
+
+    ttk = num_tokens_from_string(text_text)
+
+    if ttk > 1900:
+        title_text = slice_title_if_needed(title_text)
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
-                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
+                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."}
            ]
        )
        generated_text = completion.choices[0].message.content

+        generated_text = repair_json(generated_text)
+
        response_data = json.loads(generated_text)
-        
        title = response_data["title"]
        text = response_data["content"]
-
-        print("*********************************")
-        print(f"Title: {title}")
-        print("---------------------------------")
-        print(f"Content : {text}")
-        print("*********************************")
-
-
        vector = embeddings.embed_query(generated_text)
-
-        if not is_similar_data(title, text, link, vector, threshold=0.9):
-         insert_data(title, text, link, vector)
        
+        if not is_similar_data(title, text, link, vector, threshold=0.98):
+         similar_d = "NO"
+         insert_data(title, text, link, vector,similar_d)
+
    except Exception as e:
        print(f"Error in completion: {e}")
        continue
+
--- a/pyth/templates/index.html
+++ b/pyth/templates/index.html
@@ -0,0 +1,22 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Test Pyth</title>
+</head>
+<body>
+    <div>
+        <article>
+            <h2>Test Title 1</h2>
+            <p>Test Text 1</p>
+            <a href="/article/one"> First</a>
+        </article>
+        <article>
+            <h2>Test Title 2</h2>
+            <p>Test Text 2</p>
+            <a href="/article/two">Second</a>
+        </article>
+    </div>
+</body>
+</html>
--- a/pyth/templates/one.html
+++ b/pyth/templates/one.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Article</title>
+</head>
+<body>
+    <h2>Test Title</h2>
+    <p>Test Text</p>
+</body>
+</html>
--- a/pyth/templates/two.html
+++ b/pyth/templates/two.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Article</title>
+</head>
+<body>
+    <h2>Test Title</h2>
+    <p>Test Text</p>
+</body>
+</html>
--- a/pyth/tests/pycache/test_scrapingsingle.cpython-310.pyc
+++ b/pyth/tests/pycache/test_scrapingsingle.cpython-310.pyc
--- a/pyth/tests/pycache/test_vectData.cpython-310.pyc
+++ b/pyth/tests/pycache/test_vectData.cpython-310.pyc
--- a/pyth/tests/test_scrapingsingle.py
+++ b/pyth/tests/test_scrapingsingle.py
@@ -0,0 +1,60 @@
+import unittest
+from unittest.mock import patch
+import requests
+from bs4 import BeautifulSoup
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores.pgvector import PGVector
+from openai import OpenAI
+import json
+from dotenv import load_dotenv
+from scrapingsingle import get_article_links, insert_data, is_similar_data
+import os
+
+load_dotenv()
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+client = OpenAI()
+embeddings = OpenAIEmbeddings()
+
+
+already_checked = set()
+total_links = set()
+collected_news = set()
+dlinks = 'http://127.0.0.1:5000/'
+
+class TestIntegration(unittest.TestCase):
+    """End-to-end check of link discovery and model clean-up against the local test site."""
+
+    def test_integration(self):
+        """Discover the article links on ``dlinks`` and verify each cleaned article."""
+        found = get_article_links(dlinks, already_checked)
+        self.assertEqual(len(already_checked), 2)
+
+        # NOTE(review): assumes get_article_links returns an iterable of article
+        # URLs (possibly containing empty entries), as the scraper script uses it.
+        total_links.update(item for item in (found or ()) if item)
+
+        # Checked before the loop so an empty result fails instead of passing silently.
+        self.assertEqual(len(total_links), 2)
+
+        for link in total_links:
+            response = requests.get(link)
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            titles = soup.find_all(['h2', 'h1', 'h3'])
+            title_text = ' '.join([title.get_text(strip=True) for title in titles])
+
+            texts = soup.find_all(['p'])
+            text_text = ' '.join([text.get_text(strip=True) for text in texts])
+
+            completion = client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
+                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
+                ]
+            )
+            generated_text = completion.choices[0].message.content
+
+            response_data = json.loads(generated_text)
+            title = response_data["title"]
+            text = response_data["content"]
+
+            vector = embeddings.embed_query(generated_text)
+
+            self.assertIn("Test Title", title)
+            self.assertIn("Test Text", text)
+
+
--- a/pyth/tests/test_vectData.py
+++ b/pyth/tests/test_vectData.py
@@ -0,0 +1,89 @@
+import unittest
+import numpy as np
+import psycopg2
+import os
+from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
+
+class TestIntegration(unittest.TestCase):
+    """Integration tests for vectData against the database named by the DB_* variables."""
+
+    # Connection settings. vectData calls load_dotenv() when imported, so values
+    # from .env are already in the environment when this class body runs.
+    host = os.getenv("DB_HOST")
+    port = os.getenv("DB_PORT")
+    user = os.getenv("DB_USER")
+    password = os.getenv("DB_PASSWORD")
+    dbname = os.getenv("DB_NAME")
+
+    @classmethod
+    def setUpClass(cls):
+        """Open one connection for the whole class and make sure the table exists."""
+        cls.conn = psycopg2.connect(
+            host=cls.host,
+            port=cls.port,
+            user=cls.user,
+            password=cls.password,
+            dbname=cls.dbname
+        )
+        # create_db() takes no arguments; it works on vectData's own connection.
+        create_db()
+
+    @classmethod
+    def tearDownClass(cls):
+        """Close the class-wide connection."""
+        if not cls.conn.closed:
+            cls.conn.close()
+
+    def setUp(self):
+        """Give every test a fresh cursor on the shared connection."""
+        self.cursor = self.conn.cursor()
+
+    def tearDown(self):
+        """Release the cursor and end the transaction; the connection stays open."""
+        if not self.cursor.closed:
+            self.cursor.close()
+        self.conn.rollback()
+
+    def test_insert_and_retrieve_data(self):
+        """A row written by insert_data is returned by get_data."""
+        title = 'test_title'
+        text = 'test_text'
+        link = 'test_link'
+        embedding = np.arange(1, 1537)
+
+        insert_data(title, text, link, embedding, "NO")
+
+        data = get_data()
+
+        # Membership rather than equality: the table may already hold other rows.
+        self.assertIn((title, text, link), data)
+
+    def test_is_similar_data_integration(self):
+        """An identical embedding is reported as similar on every call."""
+        title = 'test_title'
+        text = 'test_text'
+        link = 'test_link'
+        embedding = np.arange(1, 1537)
+
+        insert_data(title, text, link, embedding, "NO")
+
+        for _ in range(3):
+            result = is_similar_data(title, text, link, embedding)
+            self.assertTrue(result)
+
+    def test_create_db_integration(self):
+        """The vectorsvevijesti table exists after create_db has run."""
+        self.cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'")
+        table_exist = bool(self.cursor.fetchone())
+        self.assertTrue(table_exist)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/pyth/vectData.py
+++ b/pyth/vectData.py
@@ -3,113 +3,193 @@ from psycopg2 import sql
 from pgvector.psycopg2 import register_vector
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
+import os
+from dotenv import load_dotenv
+from datetime import datetime ,timedelta

-host = 'localhost'
-port = '5432'
-user = 'postgres'
-password = 'salmonela pljusti 221 hamo'
-dbname = 'vector_svw'
+load_dotenv()

-def calculate_cosine_similarity(v1, v2):
-    v1_normalized = v1 / np.linalg.norm(v1)
-    v2_normalized = v2 / np.linalg.norm(v2)
+host = os.getenv("DB_HOST")
+port = os.getenv("DB_PORT")
+user = os.getenv("DB_USER")
+password = os.getenv("DB_PASSWORD")
+dbname = os.getenv("DB_NAME")

-    similarity = cosine_similarity([v1_normalized], [v2_normalized])[0][0]
-    return similarity
-
-def is_similar_data(title, text, link, embedding, threshold=0.9):
-    conn = psycopg2.connect(
+conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
-    cursor = conn.cursor()

-    cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
+def calculate_cosine_similarity(v1, v2):
+    """Return the cosine similarity of two vectors.
+
+    Args:
+        v1: Array-like of numbers.
+        v2: Array-like of numbers with the same number of elements as ``v1``.
+
+    Returns:
+        A float in [-1, 1], or 0.0 when either vector has zero length
+        (the similarity is undefined there and dividing would yield NaN).
+
+    Raises:
+        ValueError: If the two vectors differ in size.
+    """
+    a = np.asarray(v1, dtype=float).ravel()
+    b = np.asarray(v2, dtype=float).ravel()
+    denominator = np.linalg.norm(a) * np.linalg.norm(b)
+    if denominator == 0.0:
+        return 0.0
+    return float(np.dot(a, b) / denominator)
+
+def parse_embedding_string(embedding_str):
+    """Convert a stored embedding to a NumPy array.
+
+    A NumPy array is returned as is. A string is expected to be a
+    comma-separated list of numbers wrapped in one leading and one trailing
+    character (e.g. "[1,2,3]"); those two characters are dropped before parsing.
+
+    Raises:
+        ValueError: If the argument is neither ``str`` nor ``np.ndarray``, or
+            if an element of the string is not a valid float.
+    """
+    if isinstance(embedding_str, np.ndarray):
+        return embedding_str
+    if not isinstance(embedding_str, str):
+        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")
+
+    inner = embedding_str[1:-1]
+    values = list(map(float, inner.split(',')))
+    return np.array(values)
+
+def is_similar_data(title, text, link, embedding, threshold=0.98):
+    cursor = conn.cursor()
+    cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
    existing_embeddings = cursor.fetchall()

    for existing_embedding_tuple in existing_embeddings:
        existing_title = existing_embedding_tuple[0]
        existing_embedding = np.array(existing_embedding_tuple[1]).flatten()
+        existing_link = existing_embedding_tuple[2]
        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if similarity > threshold:
-            print(f"Similar data found: \n #{title} \n #{existing_title}")
-            cursor.close()
-            conn.close()
-            return True
+            if link != existing_link:
+                similar_d = existing_title
+                insert_data(title,text,link,embedding,similar_d)
+                print(f"Similar data found: \n #{title} \n #{existing_title}")
+                print(f"Inserting: #{title}")
+                similar_d = "NO"
+                cursor.close()
+                return True
+            else:
+                print(f"Same article of same source!")
+                cursor.close()
+                return True

    print(f"Inserting: #{title}")
    cursor.close()
-    conn.close()
    return False

-def insert_data(title, text, link, embedding):
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
+def get_similar():
    cursor = conn.cursor()
-
-    cursor.execute('''
-        INSERT INTO vectorsvevijesti (title, text, link, embedding)
-        VALUES (%s, %s, %s, %s);
-    ''', (title, text, link, embedding))
-
-    conn.commit()
-
+    query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
+    cursor.execute(query)
+    similar_data = cursor.fetchall()
+    cursor.close()
+    return similar_data
+
+def get_titles_links_embeddings():
+    """Fetch title, link and embedding of every row whose ``ready`` flag is true.
+
+    Returns:
+        Three parallel lists ``(titles, links, embeddings)``; each embedding
+        has been passed through ``parse_embedding_string``.
+    """
+    cur = conn.cursor()
+    cur.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
+    rows = cur.fetchall()
+    cur.close()
+
+    titles = []
+    links = []
+    embeddings = []
+    for title, link, raw_embedding in rows:
+        titles.append(title)
+        links.append(link)
+        embeddings.append(parse_embedding_string(raw_embedding))
+
+    return titles, links, embeddings
+
+
+def insert_data(title, text, link, embedding, similar_d):
+    c_time = datetime.now()
+    cursor = conn.cursor()
+    cursor.execute('''
+        INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
+        VALUES (%s, %s, %s, %s, %s ,%s ,%s);
+    ''', (title, text, link, embedding , similar_d, c_time, True))
+    conn.commit()
    cursor.close()
-    conn.close()

 def get_data():
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
+
    cursor = conn.cursor()
    query = '''SELECT title,text,link FROM vectorsvevijesti;'''
-
    cursor.execute(query)
    data = cursor.fetchall()
    cursor.close()
-    conn.close()
-
    return data

-def create_db():
-    conn = psycopg2.connect(
-        host=host,
-        port=port,
-        user=user,
-        password=password,
-        dbname=dbname
-    )
+def get_ready_data():
    cursor = conn.cursor()
+    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
+    cursor.execute(query, ('True',))
+    data = cursor.fetchall()
+    cursor.close()
+    return data

+def get_source_data():
+    """Return ``(title, text, link, ready)`` for every row whose ``ready`` flag is false."""
+    cur = conn.cursor()
+    # The flag is passed as the string 'False', exactly as the query parameter.
+    cur.execute(
+        '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''',
+        ('False',),
+    )
+    rows = cur.fetchall()
+    cur.close()
+    return rows
+
+
+def modify_similar_data(new_value ,title):
+    """Set ``similar_d`` to ``new_value`` on every row whose title equals ``title``.
+
+    The change is committed on the shared module connection. If the statement
+    fails, the transaction is rolled back so the connection stays usable, and
+    the error is re-raised.
+    """
+    cursor = conn.cursor()
+    try:
+        query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
+        cursor.execute(query, (new_value, title))
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        # Always release the cursor; it was previously left open.
+        cursor.close()
+
+
+def preparing_articles(new_value ,title):
+    """Set the ``ready`` flag to ``new_value`` on every row whose title equals ``title``.
+
+    The change is committed on the shared module connection. If the statement
+    fails, the transaction is rolled back so the connection stays usable, and
+    the error is re-raised.
+    """
+    cursor = conn.cursor()
+    try:
+        query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
+        cursor.execute(query, (new_value, title))
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        # Always release the cursor; it was previously left open.
+        cursor.close()
+
+def get_specific_data(title):
+    """Return all rows with the given title.
+
+    Each row is ``(title, text, link, similar_d, embedding, ready)``; the list
+    is empty when no row matches.
+    """
+    cur = conn.cursor()
+    cur.execute(
+        '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s''',
+        (title,),
+    )
+    matches = cur.fetchall()
+    cur.close()
+    return matches
+
+
+def get_all_links():
+    """Return the set of distinct ``link`` values stored in the table."""
+    cur = conn.cursor()
+    cur.execute('''SELECT link FROM vectorsvevijesti''')
+    stored_links = set()
+    for row in cur.fetchall():
+        stored_links.add(row[0])
+    cur.close()
+    return stored_links
+
+def delete_specific(title):
+    """Delete every row whose title equals ``title`` and commit the change.
+
+    If the statement fails, the transaction on the shared module connection is
+    rolled back and the error is re-raised.
+    """
+    cursor = conn.cursor()
+    try:
+        query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
+        cursor.execute(query,(title,))
+        # Without a commit the delete stays pending in the open transaction.
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        cursor.close()
+
+def cleansing():
+    """Delete and commit every row whose ``time`` is more than one day before now."""
+    cutoff = datetime.now() - timedelta(days=1)
+    cur = conn.cursor()
+    cur.execute('''DELETE FROM vectorsvevijesti WHERE time < %s''', (cutoff,))
+    conn.commit()
+    cur.close()
+
+def drop_table():
+    """Drop the ``vectorsvevijesti`` table if it exists and commit."""
+    cur = conn.cursor()
+    cur.execute('''DROP TABLE IF EXISTS vectorsvevijesti;''')
+    conn.commit()
+    cur.close()
+
+def create_db():
+    cursor = conn.cursor()
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
-
    register_vector(conn)
-
-    cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
-
    cursor.execute('''
-        CREATE TABLE vectorsvevijesti (
+        CREATE TABLE IF NOT EXISTS vectorsvevijesti (
            id bigserial PRIMARY KEY,
            title VARCHAR,
            text VARCHAR,
            link VARCHAR,
-            embedding vector(1536)
+            embedding vector(1536),
+            similar_d VARCHAR,
+            time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            ready BOOLEAN
        );
    ''')
-
    conn.commit()
    cursor.close()
-    conn.close()
-create_db()
+create_db()
--- a/pyth/web-server.py
+++ b/pyth/web-server.py
@@ -0,0 +1,29 @@
+from flask import Flask , render_template , jsonify
+from vectData import get_ready_data
+from flask_cors import CORS
+
+
+app = Flask(__name__)
+
+# Enable Flask-CORS on the whole application with its default settings.
+CORS(app)
+
+
+@app.route('/')
+def index():
+    """Serve the listing page that links to the two test articles."""
+    return render_template("index.html")
+
+
+@app.route('/article/one')
+def articleone():
+    """Serve the first test article."""
+    return render_template("one.html")
+
+
+@app.route('/article/two')
+def articletwo():
+    """Serve the second test article."""
+    return render_template("two.html")
+
+
+@app.route('/data/get/news', methods=['GET'])
+def takenews():
+    """Return the rows from ``get_ready_data`` as a JSON array."""
+    data = get_ready_data()
+    return jsonify(data)
+
+
+if __name__ == "__main__":
+    import os
+
+    # Start the development server only when executed as a script. The
+    # interactive debugger can run arbitrary code, so it is opt-in via
+    # FLASK_DEBUG=1 instead of being always on.
+    app.run(debug=os.getenv("FLASK_DEBUG", "0") == "1")