pyth/scrapingsingle.py

from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from openai import OpenAI
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from vectData import insert_data ,is_similar_data 
import json


# SECURITY: the API key must be supplied through the OPENAI_API_KEY environment
# variable and never committed to source control. A key that has appeared in
# this file's history should be treated as leaked and revoked.
# Both clients below are constructed without arguments and therefore rely on
# the environment for credentials, so fail early with a clear message.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this script."
    )
client = OpenAI()
embeddings = OpenAIEmbeddings()

# Front pages of the news portals that are scanned for article links.
dlinks = [
    'https://klix.ba',
    'https://srpskainfo.com',
    'https://bljesak.info',
]

# Header dict for outgoing HTTP requests; presents an Android/Chrome mobile
# User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/47.0.2526.83 Mobile Safari/537.36'
    ),
}

# Accumulates every article URL discovered across all portals.
total_links = set()
# NOTE(review): not referenced by the code visible in this file — confirm
# whether it is still needed.
collected_news = set()

def get_article_links(url, already_checked):
    """Collect not-yet-seen article links from the page at *url*.

    Every ``<a href=...>`` found inside an ``<article>`` element is resolved
    against *url*. Links that are not yet in *already_checked* are returned
    and recorded in it.

    Args:
        url: Address of the page to scan.
        already_checked: Set of URLs seen so far. It is mutated in place so
            that later calls skip links returned by earlier ones.

    Returns:
        A list of newly discovered absolute URLs in document order. The list
        is empty when the page cannot be fetched or does not answer with
        HTTP 200.
    """
    try:
        # `headers` must be passed by keyword: the second positional argument
        # of requests.get is `params`, which would append the User-Agent to
        # the query string instead of sending it as a request header.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # One unreachable site should not abort the whole crawl.
        print(f"Error fetching {url}: {e}")
        return []

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for link in article.find_all('a', href=True):
            # Resolve relative hrefs against the page they were found on.
            link_value = urljoin(url, link['href'])
            if link_value not in already_checked:
                link_store.append(link_value)
                already_checked.add(link_value)
    return link_store

# Shared across all portals so a URL seen on one front page is not queued
# again when it also appears on another.
already_checked = set()

for front_page in dlinks:
    discovered = get_article_links(front_page, already_checked)
    if not discovered:
        # Nothing usable came back for this portal (None or an empty list).
        continue
    total_links.update(discovered)

# Drop any falsy entries (e.g. empty strings) before processing.
final_links = set(filter(None, total_links))

# For every collected article URL: download the page, pull the heading and
# paragraph text out of the HTML, ask the chat model to return a cleaned
# {'title', 'content'} JSON object, embed that JSON text, and store the
# article unless is_similar_data reports a match at the given threshold.
for link in final_links:
    try:
        # `headers` must be passed by keyword: the second positional argument
        # of requests.get is `params`, which would append the User-Agent to
        # the query string instead of sending it as a request header.
        response = requests.get(link, headers=headers, timeout=10)
    except requests.RequestException as e:
        # A single unreachable article must not abort the whole run.
        print(f"Error fetching {link}: {e}")
        continue

    if response.status_code != 200:
        # Same rule as get_article_links: only successful responses are
        # parsed, so error pages are never sent to the model.
        print(f"Skipping {link}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    titles = soup.find_all(['h2', 'h1', 'h3'])
    title_text = ' '.join([title.get_text(strip=True) for title in titles])

    texts = soup.find_all(['p'])
    text_text = ' '.join([text.get_text(strip=True) for text in texts])

    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
            ]
        )
        generated_text = completion.choices[0].message.content

        # The model is asked for JSON but may answer with anything; a parse
        # failure or a missing key is handled by the except clause below.
        response_data = json.loads(generated_text)

        title = response_data["title"]
        text = response_data["content"]

        print("*********************************")
        print(f"Title: {title}")
        print("---------------------------------")
        print(f"Content : {text}")
        print("*********************************")

        # The embedding is computed over the raw JSON text from the model.
        vector = embeddings.embed_query(generated_text)

        if not is_similar_data(title, text, link, vector, threshold=0.9):
            insert_data(title, text, link, vector)

    except Exception as e:
        # Per-article boundary: report the failure and move on to the next
        # link instead of stopping the run.
        print(f"Error in completion: {e}")
Adding VDB 2023-12-25 12:31:55 +01:00			`from bs4 import BeautifulSoup`
			`import requests`
			`from urllib.parse import urljoin`
			`from openai import OpenAI`
			`import os`
			`from langchain.embeddings import OpenAIEmbeddings`
			`from langchain.vectorstores.pgvector import PGVector`
			`from vectData import insert_data ,is_similar_data`
			`import json`


			`os.environ["OPENAI_API_KEY"] = "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"`
			`client = OpenAI()`
			`embeddings = OpenAIEmbeddings()`

			`dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']`
			`headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}`


			`total_links = set()`
			`collected_news = set()`

			`def get_article_links(url, already_checked):`
			`response = requests.get(url,headers)`
			`if response.status_code == 200:`
			`soup = BeautifulSoup(response.text, 'html.parser')`
			`articles = soup.find_all('article')`
			`link_store = []`

			`for article in articles:`
			`links = article.find_all('a', href=True)`
			`for link in links:`
			`link_value = urljoin(url, link['href'])`
			`if link_value not in already_checked:`
			`link_store.append(link_value)`
			`already_checked.add(link_value)`
			`return link_store`

			`already_checked = set()`

			`for dlink in dlinks:`
			`temp_links = get_article_links(dlink, already_checked)`
			`if temp_links:`
			`total_links.update(temp_links)`

			`final_links = {item for item in total_links if item}`

			`for link in final_links:`
			`response = requests.get(link,headers)`
			`soup = BeautifulSoup(response.text, 'html.parser')`

			`titles = soup.find_all(['h2', 'h1','h3'])`
			`title_text = ' '.join([title.get_text(strip=True) for title in titles])`

			`texts = soup.find_all(['p'])`
			`text_text = ' '.join([text.get_text(strip=True) for text in texts])`

			`try:`
			`completion = client.chat.completions.create(`
			`model="gpt-3.5-turbo",`
			`messages=[`
			`{"role": "system", "content": "Data analytic, Journalist and News reporter"},`
			`{"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}`
			`]`
			`)`
			`generated_text = completion.choices[0].message.content`

			`response_data = json.loads(generated_text)`

			`title = response_data["title"]`
			`text = response_data["content"]`

			`print("*********************************")`
			`print(f"Title: {title}")`
			`print("---------------------------------")`
			`print(f"Content : {text}")`
			`print("*********************************")`


			`vector = embeddings.embed_query(generated_text)`

			`if not is_similar_data(title, text, link, vector, threshold=0.9):`
			`insert_data(title, text, link, vector)`

			`except Exception as e:`
			`print(f"Error in completion: {e}")`
			`continue`