pyth/scrapingsingle.py

from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from openai import OpenAI , APIError 
import os
from langchain.embeddings import OpenAIEmbeddings
from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data)
import json
from dotenv import load_dotenv
import tiktoken


# Load variables from a .env file into the process environment before any of
# the clients below are constructed.
load_dotenv()
# Runs on every import of this module, not only when it is executed as a
# script. What cleansing() does is defined in vectData and not visible here.
cleansing()

# NOTE(review): this constant is not referenced again in the visible part of
# this file; OpenAI() and OpenAIEmbeddings() are built without it, presumably
# reading the key from the environment themselves - confirm.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Shared chat-completion client and embedding model used by the scraping loop
# and by comb_similar().
client = OpenAI()
embeddings = OpenAIEmbeddings()

# Front pages whose <article> links are harvested by get_article_links().
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
# Request headers (a mobile Chrome User-Agent) intended for every outgoing HTTP request.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}


def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Count how many tokens ``string`` occupies under ``model``'s tiktoken encoding.

    Args:
        string: Text to measure.
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        The length of the encoded token sequence.
    """
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)

def slice_text_at_2k_tokens(text, max_tokens=2000, model="gpt-3.5-turbo"):
    """Truncate ``text`` to at most ``max_tokens`` tokens.

    The result is always a ``str``. Earlier revisions returned a one-element
    list when the text was already short enough, which leaked list repr
    artefacts (brackets, quotes, escaped newlines) into prompts once callers
    stringified the result.

    Args:
        text: The text to truncate.
        max_tokens: Maximum number of tokens to keep (default 2000).
        model: Model name used to select the tiktoken encoding.

    Returns:
        ``text`` unchanged when it fits, otherwise the decoded first
        ``max_tokens`` tokens.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)

    if len(tokens) <= max_tokens:
        return text

    return encoding.decode(tokens[:max_tokens])


def replace_with_spaces(text):
    """Blank out every character that is not an allowed letter, digit or space.

    Allowed characters are ASCII letters, the listed Bosnian/Croatian/Serbian
    Latin letters, the digits 0-9 and the space itself. Anything else is
    replaced by a single space, so the output has as many characters as the
    input.
    """
    allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐđŠšŽž0123456789 "
    pieces = []
    for symbol in text:
        if symbol in allowed_chars:
            pieces.append(symbol)
        else:
            pieces.append(' ')
    return ''.join(pieces)


def fix_links(links_set):
    """Return a new set of links with a leading ``www.`` removed from the host.

    Only the prefix directly after the scheme separator (or at the very start
    of a scheme-less link) is stripped. Replacing every ``www.`` occurrence,
    as a plain ``str.replace`` does, would also rewrite paths and query
    strings (``/awww.html``, ``?ref=www.site.com``) and corrupt the URL.

    Args:
        links_set: Iterable of URL strings.

    Returns:
        A set with the normalised links; the input is not modified.
    """
    prefix = "www."
    modified_links = set()

    for link in links_set:
        scheme, separator, rest = link.partition("://")
        if separator:
            if rest.startswith(prefix):
                link = scheme + separator + rest[len(prefix):]
        elif link.startswith(prefix):
            # Scheme-less link such as "www.example.com/page".
            link = link[len(prefix):]
        modified_links.add(link)

    return modified_links

# Accumulates every article URL harvested from the front pages in `dlinks`.
total_links = set()
# NOTE(review): not read or written anywhere in the visible part of this file.
collected_news = set()


def get_article_links(url, already_checked, timeout=30):
    """Collect absolute links found inside ``<article>`` elements of a page.

    Args:
        url: Page to download; also the base for resolving relative hrefs.
        already_checked: Set of URLs seen so far. Newly found URLs are added
            to it in place, so a link is returned at most once across calls
            that share the same set.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of newly discovered absolute URLs, in document order. The list
        is empty when the request fails or the response status is not 200.
    """
    try:
        # `headers` must be passed by keyword: the second positional parameter
        # of requests.get is `params`, which would append the User-Agent to
        # the query string instead of sending it as a header.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # One unreachable site should not stop the other sites being scraped.
        print(f"Error fetching {url}: {e}")
        return []

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []

    for article in soup.find_all('article'):
        for anchor in article.find_all('a', href=True):
            link_value = urljoin(url, anchor['href'])
            if link_value not in already_checked:
                already_checked.add(link_value)
                link_store.append(link_value)

    return link_store


# Shared across all front pages so a URL that appears on several of them is
# collected only once.
already_checked = set()

# NOTE: this harvesting runs at import time, because it sits outside the
# `if __name__ == '__main__'` guard.
for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if temp_links:
        total_links.update(temp_links)

# Normalise the scraped links BEFORE comparing them with the database. The
# links handed to insert_data() are the normalised ones, so subtracting first
# would never match a "www." link and it would be re-processed on every run.
# (Assumes get_all_links() returns the stored link values - confirm in vectData.)
final_links = fix_links({item for item in total_links if item})

db_links = set(get_all_links())
new_links = final_links - db_links
final_links = new_links

if __name__ == '__main__':
    # For every new link: download the page, gather its headings and
    # paragraphs, ask the chat model for a cleaned {'title', 'content'} JSON
    # object, embed the reply and store it unless is_similar_data() reports a
    # match.
    for link in final_links:
        try:
            # `headers` must be passed by keyword: the second positional
            # parameter of requests.get is `params`, not `headers`.
            response = requests.get(link, headers=headers, timeout=30)
        except requests.RequestException as e:
            print(f"Error fetching {link}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        titles = soup.find_all(['h2', 'h1', 'h3'])
        title_text = ' '.join(title.get_text(strip=True) for title in titles)
        title_text = replace_with_spaces(title_text)

        texts = soup.find_all(['p'])
        text_text = ' '.join(text.get_text(strip=True) for text in texts)

        print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")

        # Accept either a plain string or a one-element list from the helper.
        # Calling str() on a list would inject its repr (brackets, quotes,
        # escaped newlines) into the prompt.
        sliced = slice_text_at_2k_tokens(text_text)
        if isinstance(sliced, list):
            sliced = ' '.join(sliced)
        text_text = replace_with_spaces(sliced)

        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
                ]
            )
            generated_text = completion.choices[0].message.content

            # The model is asked for JSON; a malformed reply raises here and
            # the article is skipped by the handler below.
            response_data = json.loads(generated_text)

            title = response_data["title"]
            text = response_data["content"]

            # The embedding is computed over the raw model reply (the JSON
            # text), not over the extracted content alone.
            vector = embeddings.embed_query(generated_text)

            if not is_similar_data(title, text, link, vector, threshold=0.98):
                similar_d = "NO"
                insert_data(title, text, link, vector, similar_d)

        except Exception as e:
            print(f"Error in completion: {e}")
            continue


def _merge_links(*links):
    """Join the links with spaces when all are distinct, else return the first."""
    if len(set(links)) == len(links):
        return " ".join(links)
    return links[0]


def _build_merge_prompt(texts):
    """Build the user prompt asking the model to merge 2 or 3 source texts."""
    count = len(texts)
    # Sum the per-text token counts. Comparing a tuple of counts with an int
    # raises TypeError.
    total_tokens = sum(num_tokens_from_string(t) for t in texts)

    if total_tokens < 2000:
        combined_text = slice_text_at_2k_tokens("".join(texts))
        # Accept either a plain string or a one-element list from the helper.
        if isinstance(combined_text, list):
            combined_text = combined_text[0]
        return rf"Here is text {combined_text}, combined from {count} sources, filter text, and make news content, return as JSON only with 'content' field"

    if count == 3:
        return rf"Here are 3 texts {texts[0]} {texts[1]} and {texts[2]}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."
    return rf"Here are 2 texts {texts[0]} and {texts[1]}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."


def comb_similar():
    """Merge pairs of similar articles into one model-written article.

    For each pair returned by get_similar(), both records are loaded with
    get_specific_data(). Each record is read as rows whose first row holds
    (title, text, link) at indexes 0, 1 and 2. The texts are sent to the chat
    model, the reply's 'content' field is embedded, the sources are flagged
    with modify_similar_data(..., "SOURCE"), and the merged article is
    inserted under the first article's title unless is_similar_data() reports
    a match.

    Texts are also grouped by title across pairs. When the first article's
    title has collected exactly three entries, those three are merged instead
    of the current pair.
    NOTE(review): the original looped over every title group and kept only
    the values computed for the last one, which could belong to an unrelated
    title. Using the current title's group is the presumed intent - confirm.
    A group may also hold the same article more than once.

    Failures for one pair are printed and the loop moves on to the next pair.
    """
    print("Checking similar")
    similar_article = get_similar()

    grouped_data = {}

    # Iterate over a snapshot: processed pairs are removed from
    # `similar_article` below, and removing from a list while looping over it
    # skips the element that follows.
    for sa in list(similar_article):
        first_t = get_specific_data(sa[0])
        second_t = get_specific_data(sa[1])
        if not first_t or not second_t:
            print(f"Missing data for similar pair: {sa}")
            continue

        f_title, f_text, link_f = first_t[0][0], first_t[0][1], first_t[0][2]
        s_title, s_text, link_s = second_t[0][0], second_t[0][1], second_t[0][2]

        grouped_data.setdefault(f_title, []).append((f_text, link_f))
        grouped_data.setdefault(s_title, []).append((s_text, link_s))

        group = grouped_data[f_title]
        if len(group) == 3:
            texts = [entry[0] for entry in group]
            links = [entry[1] for entry in group]
        else:
            texts = [f_text, s_text]
            links = [link_f, link_s]

        try:
            user_message = _build_merge_prompt(texts)
            link = _merge_links(*links)

            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": user_message}
                ]
            )
            generated_text = completion.choices[0].message.content

            # Parse and embed BEFORE flagging the sources, so an unusable
            # model reply does not leave the originals marked "SOURCE" with
            # no merged article stored.
            response_data = json.loads(generated_text)
            title = f_title
            text = response_data["content"]
            vector = embeddings.embed_query(generated_text)

            if f_title == s_title:
                print(f_title)
                modify_similar_data(first_t, "SOURCE")
            else:
                print(f"First: {f_title}")
                print(f"Second: {s_title}")
                modify_similar_data(first_t, "SOURCE")
                modify_similar_data(second_t, "SOURCE")
            similar_article.remove(sa)
            print("Modified")

            if not is_similar_data(title, text, link, vector, threshold=0.98):
                similar_d = "NO"
                insert_data(title, text, link, vector, similar_d)

        except Exception as e:
            print(f"Error in completion: {e}")
            continue
Adding VDB 2023-12-25 12:31:55 +01:00			`from bs4 import BeautifulSoup`
			`import requests`
			`from urllib.parse import urljoin`
Combine similar article 2024-01-02 15:00:07 +01:00			`from openai import OpenAI , APIError`
Adding VDB 2023-12-25 12:31:55 +01:00			`import os`
			`from langchain.embeddings import OpenAIEmbeddings`
added article.py 2024-01-06 08:17:05 +01:00			`from vectData import (insert_data ,is_similar_data ,get_similar, get_specific_data,get_all_links,cleansing ,modify_similar_data)`
Adding VDB 2023-12-25 12:31:55 +01:00			`import json`
Combine similar article 2024-01-02 15:00:07 +01:00			`from dotenv import load_dotenv`
			`import tiktoken`
Adding VDB 2023-12-25 12:31:55 +01:00

Combine similar article 2024-01-02 15:00:07 +01:00			`load_dotenv()`
			`cleansing()`

			`OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")`

Adding VDB 2023-12-25 12:31:55 +01:00			`client = OpenAI()`
			`embeddings = OpenAIEmbeddings()`

			`dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']`
			`headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}`


Combine similar article 2024-01-02 15:00:07 +01:00
			`def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:`
			`encoding = tiktoken.encoding_for_model(model)`
			`return len(encoding.encode(string))`

			`def slice_text_at_2k_tokens(text):`
			`encoding_name = "gpt-3.5-turbo"`
			`max_tokens = 2000`

			`encoding = tiktoken.encoding_for_model(encoding_name)`
			`tokens = encoding.encode(text)`

			`if len(tokens) <= max_tokens:`
			`return [text]`

			`sliced_tokens = tokens[:max_tokens]`
			`sliced_text = encoding.decode(sliced_tokens)`

			`return sliced_text`


			`def replace_with_spaces(text):`
			`allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐđŠšŽž0123456789 "`
			`cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text)`
			`return cleaned_text`

added article.py 2024-01-06 08:17:05 +01:00
			`def fix_links(links_set):`
			`modified_links = set()`

			`for link in links_set:`
			`if "www" in link:`
			`modified_link = link.replace("www.", "")`
			`modified_links.add(modified_link)`
			`else:`
			`modified_links.add(link)`

			`return modified_links`

Adding VDB 2023-12-25 12:31:55 +01:00			`total_links = set()`
			`collected_news = set()`

Combine similar article 2024-01-02 15:00:07 +01:00
Adding VDB 2023-12-25 12:31:55 +01:00			`def get_article_links(url, already_checked):`
			`response = requests.get(url,headers)`
			`if response.status_code == 200:`
			`soup = BeautifulSoup(response.text, 'html.parser')`
			`articles = soup.find_all('article')`
			`link_store = []`

			`for article in articles:`
			`links = article.find_all('a', href=True)`
			`for link in links:`
			`link_value = urljoin(url, link['href'])`
			`if link_value not in already_checked:`
			`link_store.append(link_value)`
			`already_checked.add(link_value)`
			`return link_store`

Combine similar article 2024-01-02 15:00:07 +01:00

Adding VDB 2023-12-25 12:31:55 +01:00			`already_checked = set()`

			`for dlink in dlinks:`
			`temp_links = get_article_links(dlink, already_checked)`
			`if temp_links:`
			`total_links.update(temp_links)`

			`final_links = {item for item in total_links if item}`
Combine similar article 2024-01-02 15:00:07 +01:00
			`db_links = set(get_all_links())`
			`new_links = final_links - db_links`
			`final_links = new_links`
added article.py 2024-01-06 08:17:05 +01:00			`final_links = set(final_links)`
Combine similar article 2024-01-02 15:00:07 +01:00
added article.py 2024-01-06 08:17:05 +01:00			`final_links = fix_links(final_links)`
Adding VDB 2023-12-25 12:31:55 +01:00
Combine similar article 2024-01-02 15:00:07 +01:00			`if __name__ == '__main__':`

			`for link in final_links:`
Adding VDB 2023-12-25 12:31:55 +01:00			`response = requests.get(link,headers)`
			`soup = BeautifulSoup(response.text, 'html.parser')`

			`titles = soup.find_all(['h2', 'h1','h3'])`
			`title_text = ' '.join([title.get_text(strip=True) for title in titles])`

			`texts = soup.find_all(['p'])`
			`text_text = ' '.join([text.get_text(strip=True) for text in texts])`
Combine similar article 2024-01-02 15:00:07 +01:00
			`text_text = text_text`
			`title_text = title_text`

			`title_text = replace_with_spaces(title_text)`


			`print(f"Tokens usage: {num_tokens_from_string(text_text, 'gpt-3.5-turbo')}")`
			`text_text = slice_text_at_2k_tokens(text_text)`
			`text_text = replace_with_spaces(str(text_text))`
Adding VDB 2023-12-25 12:31:55 +01:00
			`try:`
			`completion = client.chat.completions.create(`
			`model="gpt-3.5-turbo",`
			`messages=[`
			`{"role": "system", "content": "Data analytic, Journalist and News reporter"},`
			`{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}`
			`]`
			`)`
			`generated_text = completion.choices[0].message.content`

Combine similar article 2024-01-02 15:00:07 +01:00			`generated_text = generated_text`

Adding VDB 2023-12-25 12:31:55 +01:00			`response_data = json.loads(generated_text)`

			`title = response_data["title"]`
			`text = response_data["content"]`

Combine similar article 2024-01-02 15:00:07 +01:00			`#print("*********************************")`
			`#print(f"Title: {title}")`
			`#print("---------------------------------")`
			`#print(f"Content : {text}")`
			`#print("*********************************")`
Adding VDB 2023-12-25 12:31:55 +01:00

			`vector = embeddings.embed_query(generated_text)`

Combine similar article 2024-01-02 15:00:07 +01:00			`if not is_similar_data(title, text, link, vector, threshold=0.98):`
			`similar_d = "NO"`
			`insert_data(title, text, link, vector,similar_d)`

Adding VDB 2023-12-25 12:31:55 +01:00			`except Exception as e:`
			`print(f"Error in completion: {e}")`
			`continue`
Combine similar article 2024-01-02 15:00:07 +01:00
added article.py 2024-01-06 08:17:05 +01:00
Combine similar article 2024-01-02 15:00:07 +01:00			`def comb_similar():`

			`print("Checking similar")`
			`similar_article = get_similar()`

			`grouped_data = {}`


			`for sa in similar_article:`
			`if similar_article:`
			`first_t = get_specific_data(sa[0])`
			`second_t = get_specific_data(sa[1])`
			`link_f = first_t[0][2]`
			`link_s = second_t[0][2]`
			`f_text = first_t[0][1]`
			`s_text = second_t[0][1]`
			`f_title = first_t[0][0]`
			`s_title = second_t[0][0]`

			`if f_title in grouped_data:`
			`grouped_data[f_title].append((f_text, link_f))`
			`else:`
			`grouped_data[f_title] = [(f_text, link_f)]`

			`if s_title in grouped_data:`
			`grouped_data[s_title].append((s_text, link_s))`
			`else:`
			`grouped_data[s_title] = [(s_text, link_s)]`

			`for title, tuples in grouped_data.items():`
			`if len(tuples) == 3:`
			`text1, link1 = tuples[0]`
			`text2, link2 = tuples[1]`
			`text3, link3 = tuples[2]`

			`t1check = num_tokens_from_string(text1)`
			`t2check = num_tokens_from_string(text2)`
			`t3check = num_tokens_from_string(text3)`
			`slice_if_more = t1check,t2check,t3check`
			`if slice_if_more < 2000:`
			`combined_text = f"{text1}{text2}{text3}"`
			`combined_text = slice_text_at_2k_tokens(combined_text)`
			`user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with 'content' field"`
added article.py 2024-01-06 08:17:05 +01:00			`if link1 != link2 and link1 != link3 and link2 != link3:`
			`link = f"{link1} {link2} {link3}"`
			`else:`
			`link = link1`
Combine similar article 2024-01-02 15:00:07 +01:00
			`else:`
			`user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."`
added article.py 2024-01-06 08:17:05 +01:00			`if link1 != link2 and link1 != link3 and link2 != link3:`
			`link = f"{link1} {link2} {link3}"`
			`else:`
			`link = link1`
Combine similar article 2024-01-02 15:00:07 +01:00			`else:`
			`ftcheck = num_tokens_from_string(f_text)`
			`stcheck = num_tokens_from_string(s_text)`
			`fscomb = ftcheck + stcheck`
			`if fscomb <2000:`
			`combined_text = f"{f_text}{s_text}"`
			`user_message = rf"Here is text {combined_text}, combined from 2 sources, filter text, and make news content, return as JSON only with 'content' field"`
added article.py 2024-01-06 08:17:05 +01:00			`if link_f != link_s:`
			`link = f"{link_f} {link_s}"`
			`else:`
			`link = link_f`
Combine similar article 2024-01-02 15:00:07 +01:00
			`else:`
			`user_message = rf"Here are 2 texts {f_text} and {s_text}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with 'content' field."`
added article.py 2024-01-06 08:17:05 +01:00			`if link_f != link_s:`
			`link = f"{link_f} {link_s}"`
			`else:`
			`link = link_f`
Combine similar article 2024-01-02 15:00:07 +01:00			`try:`
			`completion = client.chat.completions.create(`
			`model="gpt-3.5-turbo",`
			`messages=[`
			`{"role": "system", "content": "Data analytic, Journalist and News reporter"},`
			`{"role": "user", "content": user_message}`
			`]`
			`)`
			`generated_text = completion.choices[0].message.content`

			`if similar_article:`
			`if f_title == s_title:`
			`print(f_title)`
			`modify_similar_data(first_t,"SOURCE")`
			`similar_article.remove(sa)`
			`print("Modified")`
			`else:`
added article.py 2024-01-06 08:17:05 +01:00			`print(f"First: {f_title}")`
Combine similar article 2024-01-02 15:00:07 +01:00			`print(f"Second: {s_title}")`
			`modify_similar_data(first_t,"SOURCE")`
			`modify_similar_data(second_t,"SOURCE")`
			`similar_article.remove(sa)`
			`print("Modified")`
			`else:`
			`print("Similar list is empty")`

			`response_data = json.loads(generated_text)`
			`title = f_title`
			`text = response_data["content"]`

			`vector = embeddings.embed_query(generated_text)`

			`if not is_similar_data(title, text, link, vector, threshold=0.98):`
			`similar_d = "NO"`
			`insert_data(title, text, link, vector, similar_d)`

			`except Exception as e:`
			`print(f"Error in completion: {e}")`
			`continue`