Changing from js to golang

2024-01-29 14:55:20 +01:00
parent 30d8ca73da
commit f4a2251178
40 changed files with 1174 additions and 438 deletions
--- a/pyth/get_articles.py
+++ b/pyth/get_articles.py
@@ -0,0 +1,161 @@
+from bs4 import BeautifulSoup
+import requests
+from urllib.parse import urljoin
+from openai import OpenAI 
+import os
+from langchain_openai import OpenAIEmbeddings
+from db_management import (insert_data ,is_similar_data ,get_all_links,cleansing )
+import json
+from dotenv import load_dotenv
+import tiktoken
+from json_repair import repair_json
+
+# Load variables from a local .env file into the process environment before
+# any of the clients below are constructed.
+load_dotenv()
+# Imported from db_management; what it does is not visible from this file.
+cleansing()
+
+# Read for reference; the clients below are constructed without arguments.
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+client = OpenAI()
+embeddings = OpenAIEmbeddings()
+
+# Front pages that are scanned for article links.
+dlinks = [
+    'https://klix.ba',
+    'https://srpskainfo.com',
+    'https://bljesak.info',
+    'https://www.index.hr',
+    'https://avaz.ba',
+    'https://www.telegraf.rs',
+    'https://www.blic.rs',
+    'https://www.vijesti.me',
+    'https://dnevnik.hr',
+    'https://24sata.hr',
+]
+# Mobile Chrome User-Agent string handed to the HTTP requests made below.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36',
+}
+
+def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
+    """Count the tokens `string` occupies under the tokenizer of `model`.
+
+    Args:
+        string: Text to measure.
+        model: Model name handed to ``tiktoken.encoding_for_model``.
+
+    Returns:
+        Length of the token sequence produced by encoding `string`.
+    """
+    token_ids = tiktoken.encoding_for_model(model).encode(string)
+    return len(token_ids)
+
+def slice_text_at_2k_tokens(text, max_tokens=1950):
+    """Truncate `text` so it holds at most `max_tokens` gpt-3.5-turbo tokens.
+
+    Args:
+        text: Article body to shorten.
+        max_tokens: Upper bound on the number of tokens kept (default 1950).
+
+    Returns:
+        `text` unchanged when it already fits, otherwise the decoded first
+        `max_tokens` tokens. The result is always a ``str``.
+    """
+    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
+    tokens = encoding.encode(text)
+    if len(tokens) <= max_tokens:
+        # Return the string itself rather than a one-element list: callers
+        # that stringify the result would otherwise embed the list's repr
+        # (brackets, quotes, backslash escapes) into the article text.
+        return text
+    return encoding.decode(tokens[:max_tokens])
+
+def slice_title_if_needed(text, max_tokens=100):
+    """Truncate a title so it holds at most `max_tokens` gpt-3.5-turbo tokens.
+
+    Args:
+        text: Title text to shorten.
+        max_tokens: Upper bound on the number of tokens kept (default 100).
+
+    Returns:
+        `text` unchanged when it already fits, otherwise the decoded first
+        `max_tokens` tokens. The result is always a ``str``.
+    """
+    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
+    tokens = encoding.encode(text)
+    if len(tokens) <= max_tokens:
+        # Return the string itself rather than a one-element list so that a
+        # short title is interpolated into the prompt as plain text instead
+        # of as "['...']".
+        return text
+    return encoding.decode(tokens[:max_tokens])
+
+def replace_with_spaces(text):
+    """Blank out every character of `text` that is not on the allow-list.
+
+    The allow-list holds ASCII letters, the accented Latin letters listed in
+    the literal below, ASCII digits and the space character.
+
+    Args:
+        text: String to sanitise.
+
+    Returns:
+        A string of the same length as `text` in which each character that is
+        not on the allow-list has been replaced by a single space.
+    """
+    permitted = frozenset("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐđŠšŽž0123456789 ")
+    return ''.join(ch if ch in permitted else ' ' for ch in text)
+
+def _strip_www(link):
+    """Return `link` with a leading "www." removed from its host, if present."""
+    from urllib.parse import urlsplit, urlunsplit
+
+    parts = urlsplit(link)
+    if not parts.netloc:
+        # No authority component (e.g. "www.example.com/a"): only a literal
+        # "www." at the very start can be a host prefix.
+        return link[4:] if link.lower().startswith("www.") else link
+
+    # Keep any "user:pass@" prefix intact and look at the host part only.
+    userinfo, at, host = parts.netloc.rpartition('@')
+    if not host.lower().startswith("www."):
+        # Hand back the original string untouched so no re-serialisation
+        # differences creep into links that need no change.
+        return link
+    return urlunsplit(parts._replace(netloc=userinfo + at + host[4:]))
+
+
+def fix_links(links_set):
+    """Normalise links by dropping the "www." host prefix.
+
+    Only a "www." that starts the host name is removed. Occurrences of
+    "www." inside the path, query or fragment are left alone, so URLs such as
+    ``https://site/awww.html`` are not corrupted.
+
+    Args:
+        links_set: Iterable of URL strings.
+
+    Returns:
+        A new set with the normalised links; the input is not modified.
+    """
+    return {_strip_www(link) for link in links_set}
+
+# Accumulates the article URLs gathered from the front pages by the
+# module-level crawl further down.
+total_links = set()
+# NOTE(review): not referenced anywhere else in the visible part of this file.
+collected_news = set()
+
+def get_article_links(url, already_checked):
+    """Collect not-yet-seen links found inside <article> elements of `url`.
+
+    Args:
+        url: Page to download; relative hrefs are resolved against it.
+        already_checked: Set of absolute URLs seen so far. It is updated in
+            place with every link this call returns.
+
+    Returns:
+        List of absolute URLs, in document order, that were not in
+        `already_checked` before the call. An empty list is returned when
+        the request fails or the response status is not 200.
+    """
+    try:
+        # `headers` has to be passed by keyword: the second positional
+        # parameter of requests.get is `params`, which would send the
+        # User-Agent as a query string instead of as a header. The timeout
+        # keeps one unresponsive site from stalling the whole crawl.
+        response = requests.get(url, headers=headers, timeout=30)
+    except requests.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        return []
+    if response.status_code != 200:
+        return []
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+    link_store = []
+    for article in soup.find_all('article'):
+        for link in article.find_all('a', href=True):
+            link_value = urljoin(url, link['href'])
+            if link_value not in already_checked:
+                link_store.append(link_value)
+                already_checked.add(link_value)
+    return link_store
+
+# URLs already returned by get_article_links, shared across all front pages
+# so the same link is not collected twice.
+already_checked = set()
+
+for dlink in dlinks:
+    temp_links = get_article_links(dlink, already_checked)
+    if temp_links:
+        total_links.update(temp_links)
+
+# Links already stored in the database are excluded before normalisation.
+db_links = set(get_all_links())
+new_links = {candidate for candidate in total_links if candidate} - db_links
+final_links = fix_links(new_links)
+
+if __name__ == '__main__':
+    # For every new link: download the page, clean the title and body, ask
+    # the chat model for a cleaned JSON record with a category, embed that
+    # record and insert it unless is_similar_data reports a match.
+    category_options = ['politics','business','sport','magazine','scitech']
+
+    for link in final_links:
+        if link in db_links:
+            continue
+        print(f"Processing link: {link}")
+        db_links.add(link)
+
+        try:
+            # `headers` has to be passed by keyword: the second positional
+            # parameter of requests.get is `params` (the query string).
+            response = requests.get(link, headers=headers, timeout=30)
+        except requests.RequestException as e:
+            print(f"Error fetching {link}: {e}")
+            continue
+        if response.status_code != 200:
+            # Same rule as get_article_links: do not feed error pages to the
+            # model or store them as news.
+            print(f"Skipping {link}: HTTP {response.status_code}")
+            continue
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        titles = soup.find_all(['h2', 'h1','h3'])
+        title_text = ' '.join([title.get_text(strip=True) for title in titles])
+
+        texts = soup.find_all(['p'])
+        text_text = ' '.join([text.get_text(strip=True) for text in texts])
+
+        title_text = replace_with_spaces(title_text)
+
+        text_text = slice_text_at_2k_tokens(text_text)
+        if isinstance(text_text, list):
+            # The slicer may hand back a one-element list for short input;
+            # unwrap it so the raw text is cleaned, not the list's repr.
+            text_text = text_text[0]
+        text_text = replace_with_spaces(text_text)
+
+        ttk = num_tokens_from_string(text_text)
+
+        if ttk > 1900:
+            # The body is close to the token budget, so cap the title too.
+            title_text = slice_title_if_needed(title_text)
+            if isinstance(title_text, list):
+                title_text = title_text[0]
+        try:
+            completion = client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
+                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title and remove 'FOTO' and 'VIDEO' from title and text, from {category_options} select category in wich that news belong,  and provide the cleaned data make sure that its on Bosnian language and valid JSON object with 'title' field, 'category' and 'content' field."}
+                ])
+            generated_text = completion.choices[0].message.content
+
+            # The model output is not guaranteed to be strict JSON; repair it
+            # before parsing.
+            generated_text = repair_json(generated_text)
+
+            response_data = json.loads(generated_text)
+            title = response_data["title"]
+            predicted_category = response_data["category"]
+            text = response_data["content"]
+
+            # Anything outside the known categories is filed under 'other'.
+            if predicted_category.lower() in category_options:
+                category = predicted_category.lower()
+            else:
+                category = 'other'
+
+            vector = embeddings.embed_query(generated_text)
+
+            print(f"Title: {title}")
+            print(f"Category: {category}")
+
+            if not is_similar_data(title, text, link, vector, threshold=0.98):
+                similar_d = "NO"
+                insert_data(title, text, link, vector,similar_d,category)
+
+        except Exception as e:
+            # Per-article boundary: one failed completion, parse or insert
+            # must not abort the remaining links.
+            print(f"Error in completion: {e}")
+