Changing from js to golang

2024-01-29 14:55:20 +01:00
parent 30d8ca73da
commit f4a2251178
40 changed files with 1174 additions and 438 deletions
--- a/pyth/.env
+++ b/pyth/.env
@@ -2,6 +2,6 @@ OPENAI_API_KEY = "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"

 DB_HOST =localhost
 DB_PORT =5432
-DB_USER =postgres
+DB_USER =svevijesti
 DB_PASSWORD =salmonela pljusti 221 hamo
 DB_NAME =svevijestiweb
--- a/pyth/pycache/db_management.cpython-310.pyc
+++ b/pyth/pycache/db_management.cpython-310.pyc
--- a/pyth/pycache/get_articles.cpython-310.pyc
+++ b/pyth/pycache/get_articles.cpython-310.pyc
--- a/pyth/pycache/publishing_finals.cpython-310.pyc
+++ b/pyth/pycache/publishing_finals.cpython-310.pyc
--- a/pyth/pycache/scrapingsingle.cpython-310.pyc
+++ b/pyth/pycache/scrapingsingle.cpython-310.pyc
--- a/pyth/pycache/tttt.cpython-310.pyc
+++ b/pyth/pycache/tttt.cpython-310.pyc
--- a/pyth/pycache/vectData.cpython-310.pyc
+++ b/pyth/pycache/vectData.cpython-310.pyc
--- a/pyth/articles.py
+++ b/pyth/articles.py
@@ -1,241 +0,0 @@
-import psycopg2
-import numpy as np
-from dotenv import load_dotenv
-import os
-from openai import OpenAI
-from langchain.embeddings import OpenAIEmbeddings
-from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings
-from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens
-import json
-from json_repair import repair_json
-
-load_dotenv()
-
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-client = OpenAI()
-embeddings = OpenAIEmbeddings()
-
-print(f"Checking for similar!")
-
-def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
-    try:
-        titles, links, embeddings = get_titles_links_embeddings()
-
-        processed_articles = set()
-        grouped_similar_articles = []
-
-        for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)):
-            if (title1, link1) not in processed_articles:
-                processed_articles.add((title1, link1))
-                group = [(title1, link1)]
-
-                for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)):
-                    if i != j and (title2, link2) not in processed_articles:
-                        similarity = calculate_cosine_similarity(embedding1, embedding2)
-
-                        if similarity > threshold:
-                            processed_articles.add((title2, link2))
-                            group.append((title2, link2))
-
-                grouped_similar_articles.append(group)
-
-        return grouped_similar_articles
-
-    except psycopg2.Error as e:
-        print(f"Error: {e}")
-        return []
-    
-def processing_similar():
-        grouped_similar_articles_result = find_and_group_similar_articles()
-
-        if grouped_similar_articles_result:
-            for group in grouped_similar_articles_result:
-                articles = []
-
-                if len(group) > 1:
-                    for article_tuple in group:
-                        if len(article_tuple) >= 2:
-                            title, link = article_tuple[:2]
-                            article = [title, link]
-                            articles.append(article)
-                    l = len(articles)
-
-                    if l == 2:
-                        a_one = articles[0][0]
-                        a_two = articles[1][0]
-
-                        get_one = get_specific_data(a_one)
-                        get_two = get_specific_data(a_two)
-
-                        text1 = get_one[0][1]
-                        text2 = get_two[0][1]
-                        link1 = get_one[0][2]
-                        link2 = get_two[0][2]
-                        if link1 != link2:
-                            link = f"{link1}, {link2}"
-                        else:
-                            link = link1
-
-                        ftoks = num_tokens_from_string(text1)
-                        stoks = num_tokens_from_string(text2)
-                        tokens = ftoks + stoks
-
-                        similar_d = f"C: {a_one}, {a_two}"
-
-                        modify_similar_data(similar_d, a_one)
-                        preparing_articles(False, a_one)
-
-                        modify_similar_data(similar_d, a_two)
-                        preparing_articles(False, a_two)
-
-                        if tokens > 2000:
-                            combined_text = f"{text1} {text2}"
-                            combined_text = slice_text_at_2k_tokens(combined_text)
-                            user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field"
-                        else:
-                            user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
-
-                    if l == 3:
-                        a_one = articles[0][0]
-                        a_two = articles[1][0]
-                        a_three = articles[2][0]
-
-                        get_one = get_specific_data(a_one)
-                        get_two = get_specific_data(a_two)
-                        get_three = get_specific_data(a_three)
-
-                        text1 = get_one[0][1]
-                        text2 = get_two[0][1]
-                        text3 = get_three[0][1]
-                        link1 = get_one[0][2]
-                        link2 = get_two[0][2]
-                        link3 = get_three[0][2]
-                        if link1 != link2:
-                            if link2 != link3:
-                                link = f"{link1}, {link2}, {link3}"
-                            else:
-                                link = f"{link1}, {link2}"
-                        else:
-                            if link2 != link3:
-                                link = f"{link1}, {link3}"
-                            else:
-                                link = link1
-                        ftoks = num_tokens_from_string(text1)
-                        stoks = num_tokens_from_string(text2)
-                        ttoks = num_tokens_from_string(text3)
-                        tokens = ftoks + stoks + ttoks
-
-                        similar_d = f"C: {a_one}, {a_two}, {a_three}"
-                        modify_similar_data(similar_d, a_one)
-                        preparing_articles(False, a_one)
-
-                        modify_similar_data(similar_d, a_two)
-                        preparing_articles(False, a_two)
-
-                        modify_similar_data(similar_d, a_three)
-                        preparing_articles(False, a_three)
-
-                        if tokens > 2000:
-                            combined_text = f"{text1} {text2} {text3}"
-                            combined_text = slice_text_at_2k_tokens(combined_text)
-                            user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field"
-                        else:
-                            user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field."
-                    if l == 4:
-                        a_one = articles[0][0]
-                        a_two = articles[1][0]
-                        a_three = articles[2][0]
-                        a_four = articles[3][0]
-
-                        get_one = get_specific_data(a_one)
-                        get_two = get_specific_data(a_two)
-                        get_three = get_specific_data(a_three)
-                        get_four = get_specific_data(a_four)
-
-                        text1 = get_one[0][1]
-                        text2 = get_two[0][1]
-                        text3 = get_three[0][1]
-                        text4 = get_four[0][1]
-                        link1 = get_one[0][2]
-                        link2 = get_two[0][2]
-                        link3 = get_three[0][2]
-                        link4 = get_four[0][2]
-
-                        if link1 != link2:
-                            if link2 != link3:
-                                if link3 != link4:
-                                    link = f"{link1}, {link2}, {link3}, {link4}"
-                                else:
-                                    link = f"{link1}, {link2}, {link3}"
-                            else:
-                                if link3 != link4:
-                                    link = f"{link1}, {link2}, {link4}"
-                                else:
-                                    link = f"{link1}, {link2}"
-                        else:
-                            if link2 != link3:
-                                if link3 != link4:
-                                    link = f"{link1}, {link3}, {link4}"
-                                else:
-                                    link = f"{link1}, {link3}"
-                            else:
-                                if link3 != link4:
-                                    link = f"{link1}, {link4}"
-                                else:
-                                    link = link1
-
-                        ftoks = num_tokens_from_string(text1)
-                        stoks = num_tokens_from_string(text2)
-                        ttoks = num_tokens_from_string(text3)
-                        frtoks = num_tokens_from_string(text4)
-
-                        tokens = ftoks + stoks + ttoks + frtoks
-
-                        similar_d = f"C: {a_one}, {a_two}, {a_three}, {a_four}"
-                        modify_similar_data(similar_d, a_one)
-                        preparing_articles(False, a_one)
-
-                        modify_similar_data(similar_d, a_two)
-                        preparing_articles(False, a_two)
-
-                        modify_similar_data(similar_d, a_three)
-                        preparing_articles(False, a_three)
-
-                        modify_similar_data(similar_d, a_four)
-                        preparing_articles(False, a_four)
-
-                        if tokens > 2000:
-                            combined_text = f"{text1} {text2} {text3} {text4}"
-                            combined_text = slice_text_at_2k_tokens(combined_text)
-                            user_message = rf"Here is text {combined_text}, combined from 4 sources, filter text, and make news content, return as JSON only with a single 'content' field"
-                        else:
-                            user_message = rf"Here are 4 texts {text1} {text2} {text3} and {text4}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field."
-                    try:
-                        completion = client.chat.completions.create(
-                            model="gpt-3.5-turbo",
-                            messages=[
-                                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
-                                {"role": "user", "content": user_message}
-                            ])
-                        generated_text = completion.choices[0].message.content
-
-                        generated_text = repair_json(generated_text)
-
-                        response_data = json.loads(generated_text)
-                        title = a_one
-                        text = response_data["content"]
-                        vector = embeddings.embed_query(generated_text)
-
-                        insert_data(title, text, link, vector, similar_d)
-                        print(f"Inserting combined: {title}")
-
-                    except Exception as e:
-                        print(f"Error: {e}")
-                        print(a_one)
-                        continue
-            else:
-                print("Done!.")
-        else:
-            print("No similar articles found.")
-if __name__=="__main__":
-    processing_similar()
--- a/pyth/checking_similar.py
+++ b/pyth/checking_similar.py
@@ -0,0 +1,122 @@
+import psycopg2
+from dotenv import load_dotenv
+import os
+from openai import OpenAI
+from langchain_openai import OpenAIEmbeddings
+from db_management import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity, get_titles_links_embeddings
+from get_articles import  slice_text_at_2k_tokens
+import json
+from json_repair import repair_json
+from publishing_finals import publish_articles
+
+# Load variables from a local .env file into the process environment.
+load_dotenv()
+
+# NOTE(review): OPENAI_API_KEY is read here but never referenced again in this
+# file; presumably the clients below pick the key up from the environment.
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+client = OpenAI()
+embeddings = OpenAIEmbeddings()
+
+# Module-level statement: printed on import as well as on direct execution.
+print("Checking for similar!")
+
+
+def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
+    """Group stored articles whose embeddings are nearly identical.
+
+    Each article that has not been absorbed into an earlier group starts a
+    group of its own, so single-member groups are part of the result.  Another
+    article joins the group when its cosine similarity to the group's first
+    article is above ``threshold`` and its link differs from that article's.
+
+    ``eps`` and ``min_samples`` are accepted but not used by the body.
+
+    Returns:
+        A list of groups.  The first member of a group is ``(title, link)``;
+        every later member is ``(title, link, embedding)``.  An empty list is
+        returned when a ``psycopg2.Error`` is raised.
+    """
+    try:
+        titles, links, vectors = get_titles_links_embeddings()
+
+        seen = set()
+        groups = []
+
+        for idx, (seed_title, seed_link, seed_vector) in enumerate(zip(titles, links, vectors)):
+            seed_key = (seed_title, seed_link)
+            if seed_key in seen:
+                continue
+
+            seen.add(seed_key)
+            members = [seed_key]
+
+            for other_idx, (other_title, other_link, other_vector) in enumerate(zip(titles, links, vectors)):
+                if other_idx == idx or (other_title, other_link) in seen:
+                    continue
+
+                score = calculate_cosine_similarity(seed_vector, other_vector)
+                if score > threshold and seed_link != other_link:
+                    seen.add((other_title, other_link))
+                    members.append((other_title, other_link, other_vector))
+
+            groups.append(members)
+        return groups
+
+    except psycopg2.Error as e:
+        print(f"Error: {e}")
+        return []
+
+
+def processing_articles(articles):
+    """Flag a group of similar articles as combined and merge their texts.
+
+    Every article in the group gets the same ``similar_d`` marker (``"C: "``
+    followed by all titles of the group) and has ``ready`` set to ``False``.
+
+    Args:
+        articles: Sequence of tuples whose first two items are title and link.
+
+    Returns:
+        ``(combined_text, link)``: the articles' texts joined with spaces and
+        passed through ``slice_text_at_2k_tokens``, and the distinct links
+        joined with ``", "`` in the order the articles were given.
+
+    Raises:
+        IndexError: if ``get_specific_data`` returns no row for a title.
+    """
+    # The marker is identical for every member, so build it once.
+    similar_label = f"C: {', '.join(art[0] for art in articles)}"
+    texts = []
+    # A list (not a set) keeps the joined link string in a stable order.
+    links = []
+
+    for article in articles:
+        a_title, a_link = article[:2]
+        # One lookup per article; the text is reused for the merge below
+        # instead of querying the database a second time.
+        texts.append(get_specific_data(a_title)[0][1])
+
+        modify_similar_data(similar_label, a_title)
+        preparing_articles(False, a_title)
+
+        if a_link not in links:
+            links.append(a_link)
+
+    combined_text = slice_text_at_2k_tokens(' '.join(texts))
+    return combined_text, ', '.join(links)
+
+
+def processing_similar():
+    """Merge every group of similar articles into one combined article.
+
+    For each group with more than one member the sources are flagged through
+    ``processing_articles``, the merged text is sent to the chat model, and the
+    ``content`` field of the JSON reply is stored with ``insert_data`` under
+    the first member's title and category.  An error while handling one group
+    is printed and the loop moves on to the next group.
+    """
+    groups = find_and_group_similar_articles()
+
+    if not groups:
+        print("No similar articles found.")
+        return
+
+    for articles in groups:
+        # Single-member groups have nothing to merge.
+        if not len(articles) > 1:
+            continue
+
+        combined_text, link = processing_articles(articles)
+        user_message = (
+            rf"Here are {len(articles)} texts {combined_text}, combine the following texts into a cohesive news, "
+            rf"remove any non-news related to all texts, and provide the cleaned data on Bosnian languageas and return as JSON only with a single 'content' field."
+        )
+
+        try:
+            completion = client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
+                    {"role": "user", "content": user_message}
+                ])
+            # The reply is repaired before parsing because it may not be valid JSON.
+            generated_text = repair_json(completion.choices[0].message.content)
+            response_data = json.loads(generated_text)
+
+            title = articles[0][0]
+            text = response_data["content"]
+            vector = embeddings.embed_query(generated_text)
+            # Column 5 of get_specific_data's row is the category.
+            category = get_specific_data(title)[0][5]
+
+            insert_data(title, text, link, vector, f"C: {', '.join(art[0] for art in articles)}", category)
+            print(f"Inserting combined: {title} and Category: {category}")
+
+        except Exception as e:
+            print(f"Error: {e}")
+            print(articles[0][0])
+            continue
+
+    # Reached whenever the loop runs to completion (it contains no break).
+    print("Done!.")
+
+
+# Script entry point: merge similar articles first, then publish.
+if __name__ == "__main__":
+    processing_similar()
+    publish_articles()
--- a/pyth/db_management.py
+++ b/pyth/db_management.py
@@ -68,7 +68,7 @@ def is_similar_data(title, text, link, embedding, threshold=0.98):

 def get_similar():
    cursor = conn.cursor()
-    query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
+    query = '''SELECT title, link, similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')'''
    cursor.execute(query)
    similar_data = cursor.fetchall()
    cursor.close()
@@ -87,18 +87,23 @@ def get_titles_links_embeddings():
    return titles, links, embeddings


-def insert_data(title, text, link, embedding, similar_d):
+def insert_data(title, text, link, embedding, similar_d,category):
    c_time = datetime.now()
    cursor = conn.cursor()
    cursor.execute('''
-        INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready)
-        VALUES (%s, %s, %s, %s, %s ,%s ,%s);
-    ''', (title, text, link, embedding , similar_d, c_time, True))
+        INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready, category)
+        VALUES (%s, %s, %s, %s, %s ,%s ,%s ,%s);
+    ''', (title, text, link, embedding , similar_d, c_time, True , category))
    conn.commit()
    cursor.close()

-def get_data():
+def insert_final(title,text,slug,link,source_id, category):
+    """Insert one row into ``articles`` and commit.
+
+    The statement uses ``ON CONFLICT (original_url) DO NOTHING``, so a row
+    whose ``link`` is already stored as ``original_url`` is silently skipped.
+    """
+    statement = '''INSERT INTO articles (title, content, slug, original_url, source_id, category)
+        VALUES (%s, %s, %s, %s, %s, %s)ON CONFLICT (original_url) DO NOTHING;'''
+    values = (title , text, slug, link, source_id, category)
+    with conn.cursor() as cur:
+        cur.execute(statement, values)
+    conn.commit()

+def get_data():
    cursor = conn.cursor()
    query = '''SELECT title,text,link FROM vectorsvevijesti;'''
    cursor.execute(query)
@@ -108,7 +113,7 @@ def get_data():

 def get_ready_data():
    cursor = conn.cursor()
-    query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
+    query = '''SELECT title, text, link, time, similar_d, category FROM vectorsvevijesti WHERE ready = %s;'''
    cursor.execute(query, ('True',))
    data = cursor.fetchall()
    cursor.close()
@@ -122,14 +127,12 @@ def get_source_data():
    cursor.close()
    return data

-
 def modify_similar_data(new_value ,title):
    cursor = conn.cursor()
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
    cursor.execute(query, (new_value, title))
    conn.commit()

-
 def preparing_articles(new_value ,title):
    cursor = conn.cursor()
    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
@@ -138,13 +141,12 @@ def preparing_articles(new_value ,title):

 def get_specific_data(title):
    cursor = conn.cursor()
-    query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s'''
+    query = '''SELECT title, text, link, similar_d, embedding, category, ready FROM vectorsvevijesti WHERE title = %s'''
    cursor.execute(query, (title,))
    specific_post = cursor.fetchall()
    cursor.close()
    return specific_post

-
 def get_all_links():
    cursor = conn.cursor()
    query = '''SELECT link FROM vectorsvevijesti'''
@@ -153,6 +155,14 @@ def get_all_links():
    cursor.close()
    return db_links

+def get_existing_titles():
+    """Return the set of titles stored in the ``articles`` table.
+
+    The query also selects ``original_url``, but only the first column
+    (the title) of each row is kept in the result.
+    """
+    query = '''SELECT title, original_url FROM articles'''
+    # ``with`` closes the cursor even when execute() or fetchall() raises.
+    with conn.cursor() as cursor:
+        cursor.execute(query)
+        return {row[0] for row in cursor.fetchall()}
+
 def delete_specific(title):
    cursor = conn.cursor()
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
@@ -192,4 +202,48 @@ def create_db():
    ''')
    conn.commit()
    cursor.close()
+
+def create_db():
+    """Create the ``vector`` extension and the ``vectorsvevijesti`` table.
+
+    Both statements are guarded with ``IF NOT EXISTS``.  The work is
+    committed before returning.
+
+    NOTE(review): a function with this name is already defined directly above
+    in the same file; this later definition is the one that takes effect.
+    """
+    # ``with`` closes the cursor even when one of the statements raises.
+    with conn.cursor() as cursor:
+        cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
+        # Kept after the CREATE EXTENSION statement, as in the original order.
+        register_vector(conn)
+        cursor.execute('''
+        CREATE TABLE IF NOT EXISTS vectorsvevijesti (
+            id bigserial PRIMARY KEY,
+            title VARCHAR,
+            text VARCHAR,
+            link VARCHAR,
+            embedding vector(1536),
+            similar_d VARCHAR,
+            time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            ready BOOLEAN,
+            category VARCHAR
+        );
+        ''')
+    conn.commit()
+
+def create_ar_table():
+    """Create the ``articles`` table if it does not exist, then commit.
+
+    ``title``, ``slug`` and ``original_url`` each carry a UNIQUE constraint.
+    """
+    # ``with`` closes the cursor even when the statement raises.
+    with conn.cursor() as cursor:
+        cursor.execute('''
+    CREATE TABLE IF NOT EXISTS "articles" (
+        "id" bigserial PRIMARY KEY,
+        "title" text NOT NULL UNIQUE,
+        "content" text NOT NULL,
+        "slug" text NOT NULL UNIQUE,
+        "created_at" timestamptz DEFAULT NOW() NOT NULL,
+        "original_url" text NOT NULL UNIQUE,
+        "source_id" int NOT NULL,
+        "category" VARCHAR
+
+    );
+    ''')
+    conn.commit()
+
+import psycopg2
+from psycopg2 import sql
+
+
 create_db()
+create_ar_table()
--- a/pyth/delete_db.py
+++ b/pyth/delete_db.py
@@ -0,0 +1,2 @@
+# Utility script: running it calls delete_tables() immediately.
+# Importing db_management also runs its module-level create_db() and
+# create_ar_table() calls first.
+# NOTE(review): delete_tables is not shown in this change set; presumably it
+# drops the tables managed by db_management — confirm before running.
+from db_management import delete_tables
+delete_tables()
--- a/pyth/scrapingsingle.py
+++ b/pyth/scrapingsingle.py
@@ -3,8 +3,8 @@ import requests
 from urllib.parse import urljoin
 from openai import OpenAI 
 import os
-from langchain.embeddings import OpenAIEmbeddings
-from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing )
+from langchain_openai import OpenAIEmbeddings
+from db_management import (insert_data ,is_similar_data ,get_all_links,cleansing )
 import json
 from dotenv import load_dotenv
 import tiktoken
@@ -18,7 +18,7 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 client = OpenAI()
 embeddings = OpenAIEmbeddings()

-dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
+dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info','https://www.index.hr', 'https://avaz.ba', 'https://www.telegraf.rs', 'https://www.blic.rs', 'https://www.vijesti.me','https://dnevnik.hr','https://24sata.hr']
 headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}

 def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
@@ -97,50 +97,65 @@ final_links = fix_links(final_links)

 if __name__ == '__main__':
 
- for link in final_links:
-    response = requests.get(link,headers)
-    soup = BeautifulSoup(response.text, 'html.parser')
+    for link in final_links:
+        if link not in db_links:
+            print(f"Processing link: {link}")
+            db_links.add(link)

-    titles = soup.find_all(['h2', 'h1','h3'])
-    title_text = ' '.join([title.get_text(strip=True) for title in titles])
+            response = requests.get(link,headers)
+            soup = BeautifulSoup(response.text, 'html.parser')

-    texts = soup.find_all(['p'])
-    text_text = ' '.join([text.get_text(strip=True) for text in texts])
+            titles = soup.find_all(['h2', 'h1','h3'])
+            title_text = ' '.join([title.get_text(strip=True) for title in titles])

-    text_text = text_text
-    title_text = title_text
+            texts = soup.find_all(['p'])
+            text_text = ' '.join([text.get_text(strip=True) for text in texts])
+
+            text_text = text_text
+            title_text = title_text
    
-    title_text = replace_with_spaces(title_text)
+            title_text = replace_with_spaces(title_text)

-    text_text = slice_text_at_2k_tokens(text_text)
-    text_text = replace_with_spaces(str(text_text))
+            text_text = slice_text_at_2k_tokens(text_text)
+            text_text = replace_with_spaces(str(text_text))

-    ttk = num_tokens_from_string(text_text)
+            ttk = num_tokens_from_string(text_text)

-    if ttk > 1900:
-        title_text = slice_title_if_needed(title_text)
-    try:
-        completion = client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[
-                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
-                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."}
-            ]
-        )
-        generated_text = completion.choices[0].message.content
+            category_options = ['politics','business','sport','magazine','scitech']

-        generated_text = repair_json(generated_text)
+            if ttk > 1900:
+                title_text = slice_title_if_needed(title_text)
+            try:
+                completion = client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
+                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title and remove 'FOTO' and 'VIDEO' from title and text, from {category_options} select category in wich that news belong,  and provide the cleaned data make sure that its on Bosnian language and valid JSON object with 'title' field, 'category' and 'content' field."}
+                ])
+                generated_text = completion.choices[0].message.content

-        response_data = json.loads(generated_text)
-        title = response_data["title"]
-        text = response_data["content"]
-        vector = embeddings.embed_query(generated_text)
+                generated_text = repair_json(generated_text)
+
+                response_data = json.loads(generated_text)
+                title = response_data["title"]
+                predicted_category = response_data["category"]
+                text = response_data["content"]
+
+                if predicted_category.lower() in category_options:
+                    category = predicted_category.lower()
+                else:
+                    category = 'other'
+
+                vector = embeddings.embed_query(generated_text)
+
+                print(f"Title: {title}")
+                print(f"Category: {category}")
        
-        if not is_similar_data(title, text, link, vector, threshold=0.98):
-         similar_d = "NO"
-         insert_data(title, text, link, vector,similar_d)
+                if not is_similar_data(title, text, link, vector, threshold=0.98):
+                    similar_d = "NO"
+                    insert_data(title, text, link, vector,similar_d,category)

-    except Exception as e:
-        print(f"Error in completion: {e}")
-        continue
+            except Exception as e:
+                print(f"Error in completion: {e}")
+                continue

--- a/pyth/publishing_finals.py
+++ b/pyth/publishing_finals.py
@@ -0,0 +1,69 @@
+from slugify import slugify
+import random
+from db_management import get_ready_data,insert_final,get_existing_titles
+
+def create_slug(title):
+    """Return a slug built from a random number in 1..1000 followed by ``title``."""
+    prefix = random.randint(1, 1000)
+    return slugify(f"{prefix} {title}")
+
+def get_source_id(link,similar):
+    """Map an article link to a numeric source id.
+
+    When ``similar`` is anything other than ``"NO"`` the id is always 5 and
+    ``link`` is not inspected.  Otherwise the first marker found inside
+    ``link`` decides the id, and 0 is returned when no marker matches.
+    """
+    if similar != "NO":
+        return 5
+
+    # Checked in this order; the first substring hit wins.
+    markers = (
+        ("srpskainfo", 1),
+        ("klix", 2),
+        ("bljesak", 3),
+        ("blic", 4),
+        ("index.hr", 6),
+        ("avaz", 7),
+        ("telegraf", 8),
+        ("vijesti.me", 9),
+        ("dnevnik.hr", 10),
+        ("24sata.hr", 11),
+    )
+    for marker, source_id in markers:
+        if marker in link:
+            return source_id
+    return 0
+
+# Snapshot of the ready rows, taken once when this module is first imported.
+data = get_ready_data()
+
+def remove_braces_and_quotes(text):
+    """Strip every ``{"`` and then every ``"}`` sequence from ``text``."""
+    return text.replace('{"', '').replace('"}', '')
+
+
+def publish_articles():
+    """Copy ready rows from the staging table into ``articles``.
+
+    Rows are fetched when the function is called rather than taken from a
+    snapshot made at import time, so ``ready`` flags flipped or rows inserted
+    earlier in the same run are honoured.
+
+    A row is skipped when its title, raw or cleaned, is already published or
+    was published earlier in this call.  Duplicate URLs are left to
+    ``insert_final``, whose INSERT uses ``ON CONFLICT (original_url) DO
+    NOTHING``.
+    """
+    # Load the published titles once; the set is kept current as rows are
+    # inserted, which avoids re-reading the whole table for every row.
+    published_titles = set(get_existing_titles())
+
+    for row in get_ready_data():
+        # Row layout: title, text, link, time, similar_d, category.
+        title, text, link = row[0], row[1], row[2]
+        similar_d, category = row[4], row[5]
+
+        # Titles are stored cleaned, so compare the cleaned form as well.
+        clean_title = remove_braces_and_quotes(title)
+        if title in published_titles or clean_title in published_titles:
+            continue
+
+        slug = create_slug(title)
+        source_id = get_source_id(link, similar_d)
+        clean_text = remove_braces_and_quotes(text)
+
+        print(f"Source: {source_id}")
+        print(f"Link: {link}")
+        insert_final(clean_title, clean_text, slug, link, source_id, category)
+        published_titles.add(clean_title)
+        print(f"Publishing: {clean_title}")
--- a/pyth/templates/index.html
+++ b/pyth/templates/index.html
@@ -1,22 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Test Pyth</title>
-</head>
-<body>
-    <div>
-        <article>
-            <h2>Test Title 1</h2>
-            <p>Test Text 1</p>
-            <a href="/article/one"> First</a>
-        </article>
-        <article>
-            <h2>Test Title 2</h2>
-            <p>Test Text 2</p>
-            <a href="/article/two">Second</a>
-        </article>
-    </div>
-</body>
-</html>
--- a/pyth/templates/one.html
+++ b/pyth/templates/one.html
@@ -1,12 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Article</title>
-</head>
-<body>
-    <h2>Test Title</h2>
-    <p>Test Text</p>
-</body>
-</html>
--- a/pyth/templates/two.html
+++ b/pyth/templates/two.html
@@ -1,12 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Article</title>
-</head>
-<body>
-    <h2>Test Title</h2>
-    <p>Test Text</p>
-</body>
-</html>
--- a/pyth/tests/test_scrapingsingle.py
+++ b/pyth/tests/test_scrapingsingle.py
@@ -7,7 +7,7 @@ from langchain.vectorstores.pgvector import PGVector
 from openai import OpenAI
 import json
 from dotenv import load_dotenv
-from scrapingsingle import get_article_links, insert_data, is_similar_data
+from pyth.get_articles import get_article_links, insert_data, is_similar_data
 import os

 load_dotenv()
--- a/pyth/tests/test_vectData.py
+++ b/pyth/tests/test_vectData.py
@@ -2,7 +2,7 @@ import unittest
 import numpy as np
 import psycopg2
 import os
-from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
+from pyth.db_management import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db

 class TestIntegration(unittest.TestCase):
    host = os.getenv("DB_HOST")
--- a/pyth/web-server.py
+++ b/pyth/web-server.py
@@ -1,29 +0,0 @@
-from flask import Flask , render_template , jsonify
-from vectData import get_ready_data
-from flask_cors import CORS
-
-
-app = Flask(__name__)
-
-CORS(app)
-
-@app.route('/')
-def index() :
-    return render_template("index.html")
-
-
-@app.route('/article/one')
-def articleone():
-    return render_template("one.html")
-
-
-@app.route('/article/two')
-def articletwo():
-    return render_template("two.html")
-
-@app.route('/data/get/news', methods=['GET'])
-def takenews():
-    data = get_ready_data()
-    return jsonify(data)
-
-app.run(debug=True)