pyth/get_articles.py

from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from openai import OpenAI 
import os
from langchain_openai import OpenAIEmbeddings
from db_management import (insert_data ,is_similar_data ,get_all_links,cleansing )
import json
from dotenv import load_dotenv
import tiktoken
from json_repair import repair_json

# Load variables from a .env file into the process environment before any
# client below is constructed.
load_dotenv()
# Imported from db_management; invoked once at import time.
cleansing()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Both clients are created without explicit arguments.
client = OpenAI()
embeddings = OpenAIEmbeddings()

# Site roots that are crawled for article links.
dlinks = [
    'https://klix.ba',
    'https://srpskainfo.com',
    'https://bljesak.info',
    'https://www.index.hr',
    'https://avaz.ba',
    'https://www.telegraf.rs',
    'https://www.blic.rs',
    'https://www.vijesti.me',
    'https://dnevnik.hr',
    'https://24sata.hr',
]
# Mobile-browser User-Agent used for the HTTP requests in this file.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/47.0.2526.83 Mobile Safari/537.36'
    ),
}

def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Return the number of tokens *string* encodes to for *model*.

    The encoding is looked up with ``tiktoken.encoding_for_model``.
    """
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)

def slice_text_at_2k_tokens(text, max_tokens=1950, model="gpt-3.5-turbo"):
    """Return *text* truncated to at most *max_tokens* tokens.

    Args:
        text: The string to limit.
        max_tokens: Maximum number of tokens to keep (default 1950).
        model: Model name used to select the tiktoken encoding.

    Returns:
        Always a ``str``: *text* itself when it already fits, otherwise the
        decoded first *max_tokens* tokens.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        # Return the string itself, not a one-element list: a list here ends
        # up as its repr (brackets, quotes, escapes) once a caller str()s it.
        return text
    return encoding.decode(tokens[:max_tokens])

def slice_title_if_needed(text, max_tokens=100, model="gpt-3.5-turbo"):
    """Return *text* truncated to at most *max_tokens* tokens.

    Args:
        text: The title string to limit.
        max_tokens: Maximum number of tokens to keep (default 100).
        model: Model name used to select the tiktoken encoding.

    Returns:
        Always a ``str``: *text* itself when it already fits, otherwise the
        decoded first *max_tokens* tokens.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        # Return the string itself, not a one-element list: a list would be
        # rendered as "['...']" when interpolated into a prompt.
        return text
    return encoding.decode(tokens[:max_tokens])

def replace_with_spaces(text):
    """Return *text* with every character outside the allowed set replaced by a space.

    The allowed set is ASCII letters, the listed Latin letters with
    diacritics, digits and the space character. The result has the same
    length as the input.
    """
    permitted = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐđŠšŽž0123456789 "
    )
    pieces = []
    for symbol in text:
        pieces.append(symbol if symbol in permitted else ' ')
    return ''.join(pieces)

def fix_links(links_set):
    """Return a new set of links with a leading "www." removed from the host.

    Only the "www." that directly follows the scheme separator ("://"), or
    that starts a scheme-less link, is removed. Occurrences of "www." later
    in the URL (path, query string) are left untouched so the link still
    points at the same resource.

    Args:
        links_set: Iterable of URL strings.

    Returns:
        A set of normalized URL strings; links that normalize to the same
        value collapse into one entry.
    """
    prefix = "www."
    normalized = set()
    for link in links_set:
        scheme, separator, remainder = link.partition("://")
        if separator:
            if remainder.startswith(prefix):
                remainder = remainder[len(prefix):]
            normalized.add(scheme + separator + remainder)
        elif link.startswith(prefix):
            normalized.add(link[len(prefix):])
        else:
            normalized.add(link)
    return normalized

# Collects the URLs returned by get_article_links for every site in dlinks.
total_links = set()
# Not referenced anywhere else in the visible part of this file.
collected_news = set()

def get_article_links(url, already_checked, timeout=30):
    """Collect absolute links found inside ``<article>`` elements of a page.

    Args:
        url: Page to download; also the base for resolving relative hrefs.
        already_checked: Set of URLs seen so far. Newly found URLs are added
            to it in place, so a URL is returned at most once across calls
            that share the same set.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of new absolute URLs in document order. The list is empty when
        the request fails or the response status is not 200.
    """
    try:
        # `headers` must be passed by keyword: the second positional argument
        # of requests.get is `params`, which would put the dict in the query
        # string instead of sending it as HTTP headers.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # One unreachable site should not abort the whole crawl.
        print(f"Error fetching {url}: {error}")
        return []

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for anchor in article.find_all('a', href=True):
            link_value = urljoin(url, anchor['href'])
            if link_value not in already_checked:
                already_checked.add(link_value)
                link_store.append(link_value)
    return link_store

# Shared across all sites so a URL is collected at most once.
already_checked = set()

# Crawl every site in dlinks. This runs at import time, outside the
# __main__ guard below.
for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if temp_links:
        total_links.update(temp_links)
# Drop falsy entries (for example empty strings).
final_links = {item for item in total_links if item}

# Remove links that get_all_links() already reports.
db_links = set(get_all_links())
new_links = final_links - db_links
final_links = new_links
final_links = set(final_links)
# NOTE(review): fix_links runs only after the subtraction above, so a link
# whose rewritten form is already in db_links is not removed here; the loop
# under __main__ checks membership in db_links again.
final_links = fix_links(final_links)

if __name__ == '__main__':
    # For every new link: download the page, extract title and body text,
    # ask the chat model for a cleaned JSON version with a category, embed
    # the result, and store it unless is_similar_data reports a match.

    # Loop-invariant lookup data, built once instead of once per article.
    category_options = ['politics','business','sport','magazine','scitech']
    category_translation = {
        'politics': 'Politika',
        'business': 'Biznis',
        'sport': 'Sport',
        'magazine': 'Magazin',
        'scitech': 'Nauka i tehnologija',
        'other': 'Ostalo',
    }

    for link in final_links:
        if link in db_links:
            continue
        print(f"Processing link: {link}")
        # Mark as seen up front so the link is not retried within this run.
        db_links.add(link)

        try:
            # `headers` must be passed by keyword; positionally it would be
            # sent as query parameters instead of HTTP headers.
            response = requests.get(link, headers=headers, timeout=30)
        except requests.RequestException as e:
            # A single unreachable article should not stop the whole run.
            print(f"Error fetching {link}: {e}")
            continue
        if response.status_code != 200:
            # Do not send error pages to the model.
            print(f"Skipping {link}: HTTP {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        titles = soup.find_all(['h2', 'h1', 'h3'])
        title_text = ' '.join(title.get_text(strip=True) for title in titles)
        title_text = replace_with_spaces(title_text)

        paragraphs = soup.find_all(['p'])
        text_text = ' '.join(p.get_text(strip=True) for p in paragraphs)
        text_text = slice_text_at_2k_tokens(text_text)
        # Tolerate a list-wrapped result: join it instead of str()-ing it,
        # which would leak the list repr (brackets, quotes, escapes) into
        # the prompt.
        if isinstance(text_text, list):
            text_text = ' '.join(text_text)
        text_text = replace_with_spaces(text_text)

        ttk = num_tokens_from_string(text_text)

        # When the body is close to its token cap, limit the title as well.
        if ttk > 1900:
            title_text = slice_title_if_needed(title_text)
            if isinstance(title_text, list):
                title_text = ' '.join(title_text)

        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title and remove 'FOTO' and 'VIDEO' from title and text, from {category_options} select category in wich that news belong,  and provide the cleaned data make sure that its on Bosnian language and valid JSON object with 'title' field, 'category' and 'content' field."}
                ])
            generated_text = completion.choices[0].message.content

            # The model output is not guaranteed to be valid JSON; repair it
            # before parsing.
            generated_text = repair_json(generated_text)

            response_data = json.loads(generated_text)
            title = response_data["title"]
            predicted_category = response_data["category"]
            text = response_data["content"]

            # Anything outside the known options is filed under 'other'.
            category = predicted_category.lower()
            if category not in category_options:
                category = 'other'
            category = category_translation.get(category, category.capitalize())

            # The embedding is computed from the repaired JSON string.
            vector = embeddings.embed_query(generated_text)

            print(f"Category: {category}")

            if not is_similar_data(title, text, link, vector, threshold=0.98):
                similar_d = "NO"
                insert_data(title, text, link, vector,similar_d,category)

        except Exception as e:
            # Best-effort per article: report the failure and move on.
            print(f"Error in completion: {e}")
Adding VDB 2023-12-25 12:31:55 +01:00			`from bs4 import BeautifulSoup`
			`import requests`
			`from urllib.parse import urljoin`
organizing code 2024-01-07 03:41:32 +01:00			`from openai import OpenAI`
Adding VDB 2023-12-25 12:31:55 +01:00			`import os`
Changing from js to golang 2024-01-29 14:55:20 +01:00			`from langchain_openai import OpenAIEmbeddings`
			`from db_management import (insert_data ,is_similar_data ,get_all_links,cleansing )`
Adding VDB 2023-12-25 12:31:55 +01:00			`import json`
Combine similar article 2024-01-02 15:00:07 +01:00			`from dotenv import load_dotenv`
			`import tiktoken`
Fixed response/JSON 2024-01-08 00:28:20 +01:00			`from json_repair import repair_json`
Adding VDB 2023-12-25 12:31:55 +01:00
Combine similar article 2024-01-02 15:00:07 +01:00			`load_dotenv()`
			`cleansing()`

			`OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")`

Adding VDB 2023-12-25 12:31:55 +01:00			`client = OpenAI()`
			`embeddings = OpenAIEmbeddings()`

Changing from js to golang 2024-01-29 14:55:20 +01:00			`dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info','https://www.index.hr', 'https://avaz.ba', 'https://www.telegraf.rs', 'https://www.blic.rs', 'https://www.vijesti.me','https://dnevnik.hr','https://24sata.hr']`
Adding VDB 2023-12-25 12:31:55 +01:00			`headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}`

Combine similar article 2024-01-02 15:00:07 +01:00			`def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:`
			`encoding = tiktoken.encoding_for_model(model)`
			`return len(encoding.encode(string))`

			`def slice_text_at_2k_tokens(text):`
			`encoding_name = "gpt-3.5-turbo"`
Fixed response/JSON 2024-01-08 00:28:20 +01:00			`max_tokens = 1950`
Combine similar article 2024-01-02 15:00:07 +01:00			`encoding = tiktoken.encoding_for_model(encoding_name)`
			`tokens = encoding.encode(text)`
			`if len(tokens) <= max_tokens:`
			`return [text]`
			`sliced_tokens = tokens[:max_tokens]`
			`sliced_text = encoding.decode(sliced_tokens)`
			`return sliced_text`

Fixed response/JSON 2024-01-08 00:28:20 +01:00			`def slice_title_if_needed(text):`
			`encoding_name = "gpt-3.5-turbo"`
			`max_tokens = 100`
			`encoding = tiktoken.encoding_for_model(encoding_name)`
			`tokens = encoding.encode(text)`
			`if len(tokens) <= max_tokens:`
			`return [text]`
			`sliced_tokens = tokens[:max_tokens]`
			`sliced_text = encoding.decode(sliced_tokens)`
			`return sliced_text`
Combine similar article 2024-01-02 15:00:07 +01:00
			`def replace_with_spaces(text):`
			`allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐđŠšŽž0123456789 "`
			`cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text)`
			`return cleaned_text`

added article.py 2024-01-06 08:17:05 +01:00			`def fix_links(links_set):`
			`modified_links = set()`
			`for link in links_set:`
			`if "www" in link:`
			`modified_link = link.replace("www.", "")`
			`modified_links.add(modified_link)`
			`else:`
			`modified_links.add(link)`
			`return modified_links`

Adding VDB 2023-12-25 12:31:55 +01:00			`total_links = set()`
			`collected_news = set()`

			`def get_article_links(url, already_checked):`
			`response = requests.get(url,headers)`
			`if response.status_code == 200:`
			`soup = BeautifulSoup(response.text, 'html.parser')`
			`articles = soup.find_all('article')`
			`link_store = []`

			`for article in articles:`
			`links = article.find_all('a', href=True)`
			`for link in links:`
			`link_value = urljoin(url, link['href'])`
			`if link_value not in already_checked:`
			`link_store.append(link_value)`
			`already_checked.add(link_value)`
			`return link_store`

			`already_checked = set()`

			`for dlink in dlinks:`
			`temp_links = get_article_links(dlink, already_checked)`
			`if temp_links:`
			`total_links.update(temp_links)`
			`final_links = {item for item in total_links if item}`
Combine similar article 2024-01-02 15:00:07 +01:00
			`db_links = set(get_all_links())`
			`new_links = final_links - db_links`
			`final_links = new_links`
added article.py 2024-01-06 08:17:05 +01:00			`final_links = set(final_links)`
			`final_links = fix_links(final_links)`
Adding VDB 2023-12-25 12:31:55 +01:00
Combine similar article 2024-01-02 15:00:07 +01:00			`if __name__ == '__main__':`
Fixed response/JSON 2024-01-08 00:28:20 +01:00
Changing from js to golang 2024-01-29 14:55:20 +01:00			`for link in final_links:`
			`if link not in db_links:`
			`print(f"Processing link: {link}")`
			`db_links.add(link)`
Adding VDB 2023-12-25 12:31:55 +01:00
Changing from js to golang 2024-01-29 14:55:20 +01:00			`response = requests.get(link,headers)`
			`soup = BeautifulSoup(response.text, 'html.parser')`
Adding VDB 2023-12-25 12:31:55 +01:00
Changing from js to golang 2024-01-29 14:55:20 +01:00			`titles = soup.find_all(['h2', 'h1','h3'])`
			`title_text = ' '.join([title.get_text(strip=True) for title in titles])`
Combine similar article 2024-01-02 15:00:07 +01:00
Changing from js to golang 2024-01-29 14:55:20 +01:00			`texts = soup.find_all(['p'])`
			`text_text = ' '.join([text.get_text(strip=True) for text in texts])`

			`text_text = text_text`
			`title_text = title_text`
Combine similar article 2024-01-02 15:00:07 +01:00
Changing from js to golang 2024-01-29 14:55:20 +01:00			`title_text = replace_with_spaces(title_text)`

			`text_text = slice_text_at_2k_tokens(text_text)`
			`text_text = replace_with_spaces(str(text_text))`

			`ttk = num_tokens_from_string(text_text)`

			`category_options = ['politics','business','sport','magazine','scitech']`

Weather and Category 2024-01-31 12:37:55 +01:00			`category_translation = {`
			`'politics': 'Politika',`
			`'business': 'Biznis',`
			`'sport': 'Sport',`
			`'magazine': 'Magazin',`
			`'scitech': 'Nauka i tehnologija',`
			`'other': 'Ostalo',`
			`}`

Changing from js to golang 2024-01-29 14:55:20 +01:00			`if ttk > 1900:`
			`title_text = slice_title_if_needed(title_text)`
			`try:`
			`completion = client.chat.completions.create(`
			`model="gpt-3.5-turbo",`
			`messages=[`
			`{"role": "system", "content": "Data analytic, Journalist and News reporter"},`
			`{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title and remove 'FOTO' and 'VIDEO' from title and text, from {category_options} select category in wich that news belong, and provide the cleaned data make sure that its on Bosnian language and valid JSON object with 'title' field, 'category' and 'content' field."}`
			`])`
			`generated_text = completion.choices[0].message.content`

			`generated_text = repair_json(generated_text)`

			`response_data = json.loads(generated_text)`
			`title = response_data["title"]`
			`predicted_category = response_data["category"]`
			`text = response_data["content"]`

			`if predicted_category.lower() in category_options:`
			`category = predicted_category.lower()`
			`else:`
			`category = 'other'`

Weather and Category 2024-01-31 12:37:55 +01:00			`category = category_translation.get(category, category.capitalize())`

Changing from js to golang 2024-01-29 14:55:20 +01:00			`vector = embeddings.embed_query(generated_text)`

			`print(f"Category: {category}")`
Adding VDB 2023-12-25 12:31:55 +01:00
Changing from js to golang 2024-01-29 14:55:20 +01:00			`if not is_similar_data(title, text, link, vector, threshold=0.98):`
			`similar_d = "NO"`
			`insert_data(title, text, link, vector,similar_d,category)`
Combine similar article 2024-01-02 15:00:07 +01:00
Changing from js to golang 2024-01-29 14:55:20 +01:00			`except Exception as e:`
			`print(f"Error in completion: {e}")`
			`continue`
Combine similar article 2024-01-02 15:00:07 +01:00