Adding VDB

2023-12-25 12:31:55 +01:00
parent 18c8fdee7d
commit 954ae97a96
3 changed files with 202 additions and 32 deletions
--- a/pyth/scrapingsingle.py
+++ b/pyth/scrapingsingle.py
@@ -0,0 +1,87 @@
 from bs4 import BeautifulSoup
 import requests
 from urllib.parse import urljoin
 from openai import OpenAI
 import os
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.vectorstores.pgvector import PGVector
 from vectData import insert_data ,is_similar_data 
 import json
 # SECURITY: the OpenAI API key must be supplied through the process
 # environment and never stored in source control. Any key that was committed
 # here in the past should be treated as leaked and revoked.
 if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
 # Both clients are built without explicit credentials, so they pick the key
 # up from OPENAI_API_KEY.
 client = OpenAI()
 embeddings = OpenAIEmbeddings()
 # Front pages that are scanned for article links.
 dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
 # HTTP headers for the scraping requests (mobile Chrome User-Agent).
 headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
 # Union of all article links discovered across ``dlinks``.
 total_links = set()
 # NOTE(review): not referenced anywhere else in the visible script.
 collected_news = set()
 def get_article_links(url, already_checked):
    """Collect not-yet-seen links found inside ``<article>`` elements of *url*.

    Args:
        url: Page to download; also the base used to resolve relative hrefs.
        already_checked: Set of absolute URLs seen so far. It is updated in
            place with every link returned by this call.

    Returns:
        A list of absolute URLs, in document order, that were not in
        ``already_checked`` before the call. The list is empty when the page
        does not answer with HTTP 200.
    """
    # ``headers`` has to be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, which would put the User-Agent into
    # the query string instead of sending it as a request header.
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Explicit empty result instead of an implicit ``None``.
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for link in article.find_all('a', href=True):
            link_value = urljoin(url, link['href'])
            if link_value not in already_checked:
                link_store.append(link_value)
                already_checked.add(link_value)
    return link_store
 # Step 1: gather article links from every front page, de-duplicated through
 # the shared ``already_checked`` set.
 already_checked = set()
 for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if temp_links:
        total_links.update(temp_links)
 # Drop empty entries before visiting the articles.
 final_links = {item for item in total_links if item}
 # Step 2: download each article, let the chat model clean it up, embed the
 # result and store it unless a similar entry is already in the database.
 for link in final_links:
    try:
        # ``headers`` must be a keyword argument; passed positionally it is
        # interpreted as ``params`` and ends up in the query string.
        response = requests.get(link, headers=headers)
    except requests.RequestException as e:
        # One unreachable article must not abort the whole run.
        print(f"Error fetching {link}: {e}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    titles = soup.find_all(['h2', 'h1','h3'])
    title_text = ' '.join([title.get_text(strip=True) for title in titles])
    texts = soup.find_all(['p'])
    text_text = ' '.join([text.get_text(strip=True) for text in texts])
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
            ]
        )
        generated_text = completion.choices[0].message.content
        # The model is asked for JSON; a malformed answer raises here and is
        # reported by the handler below.
        response_data = json.loads(generated_text)
        title = response_data["title"]
        text = response_data["content"]
        print("*********************************")
        print(f"Title: {title}")
        print("---------------------------------")
        print(f"Content : {text}")
        print("*********************************")
        vector = embeddings.embed_query(generated_text)
        if not is_similar_data(title, text, link, vector, threshold=0.9):
            insert_data(title, text, link, vector)
    except Exception as e:
        # Best-effort processing: report the failure and move on to the next
        # article.
        print(f"Error in completion: {e}")
        continue
--- a/pyth/singlearticle.py
+++ b/pyth/singlearticle.py
@@ -1,32 +0,0 @@
 import requests
 from openai import OpenAI
 import os
 from bs4 import BeautifulSoup
 # SECURITY: the OpenAI API key must be supplied through the process
 # environment and never stored in source control. Any key that was committed
 # here in the past should be treated as leaked and revoked.
 if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
 # Built without explicit credentials, so the key comes from OPENAI_API_KEY.
 client = OpenAI()
 # Front pages whose text is sent to the model.
 urls = ['https://klix.ba/', 'https://srpskainfo.com/', 'https://bljesak.info/']
 # For every front page: download it, glue together the text of all <h2> and
 # <p> elements (no separator), and ask the chat model to extract the news.
 for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    prompt_text = ''.join(
        element.get_text(strip=True) for element in soup.find_all(['h2', 'p'])
    )
    system_message = {"role": "system", "content": "Data analytic, Journalist and News reporter"}
    user_message = {"role": "user", "content": f"Extract for me evry title and full content for evry title from {prompt_text},without shortening,remove all thing that are not connected to news,  make it clear for reading"}
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message]
    )
    # Only the first choice of the completion is used.
    generated_text = completion.choices[0].message.content
    print(f"Text for {url}: \n {generated_text}\n")
--- a/pyth/vectData.py
+++ b/pyth/vectData.py
@@ -0,0 +1,115 @@
 import psycopg2
 from psycopg2 import sql
 from pgvector.psycopg2 import register_vector
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import os
 # Connection settings come from the standard libpq environment variables so
 # that credentials are not stored in source control. The non-secret values
 # keep their previous defaults.
 host = os.environ.get('PGHOST', 'localhost')
 port = os.environ.get('PGPORT', '5432')
 user = os.environ.get('PGUSER', 'postgres')
 # SECURITY: deliberately no default. A password that was committed here in the
 # past should be treated as leaked and changed. When PGPASSWORD is unset this
 # is None, which psycopg2 leaves out of the connection string.
 password = os.environ.get('PGPASSWORD')
 dbname = os.environ.get('PGDATABASE', 'vector_svw')
 def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: Sequence or array of numbers; flattened to one dimension.
        v2: Sequence or array of numbers with the same number of elements.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a Python float, or ``0.0`` when
        either vector has zero length (the quotient is undefined there).

    Raises:
        ValueError: If the flattened vectors differ in length.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    # A single division by the product of the norms replaces normalising each
    # vector first and then normalising again inside a library call.
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0.0:
        # Avoid 0/0 -> NaN for all-zero vectors.
        return 0.0
    return float(np.dot(a, b) / norm_product)
 def is_similar_data(title, text, link, embedding, threshold=0.9):
    """Report whether a stored article is already similar to *embedding*.

    Every row of ``vectorsvevijesti`` is loaded and its embedding compared
    with *embedding* using ``calculate_cosine_similarity``.

    Args:
        title: Title of the candidate article; only used in the log output.
        text: Unused here; kept for a signature parallel to ``insert_data``.
        link: Unused here; kept for a signature parallel to ``insert_data``.
        embedding: Embedding vector of the candidate article.
        threshold: Similarity strictly above this value counts as a match.

    Returns:
        True as soon as one stored embedding exceeds *threshold*, otherwise
        False.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        # Without this adapter psycopg2 returns ``vector`` columns as text,
        # which cannot be fed into the similarity computation.
        register_vector(conn)
        with conn.cursor() as cursor:
            cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
            existing_embeddings = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()
    for existing_title, stored_embedding in existing_embeddings:
        if stored_embedding is None:
            # Rows without an embedding cannot be compared.
            continue
        existing_embedding = np.array(stored_embedding).flatten()
        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if similarity > threshold:
            print(f"Similar data found: \n #{title} \n #{existing_title}")
            return True
    print(f"Inserting: #{title}")
    return False
 def insert_data(title, text, link, embedding):
    """Insert one article row into ``vectorsvevijesti``.

    Args:
        title: Article title.
        text: Article body.
        link: URL the article was scraped from.
        embedding: Embedding vector stored in the ``embedding`` column.

    Raises:
        psycopg2.Error: If connecting or inserting fails; the transaction is
            rolled back and the connection closed before the error
            propagates.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        # ``with conn`` commits on success and rolls back on an exception;
        # the cursor context manager closes the cursor.
        with conn, conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO vectorsvevijesti (title, text, link, embedding) '
                'VALUES (%s, %s, %s, %s);',
                (title, text, link, embedding)
            )
    finally:
        # ``with conn`` does not close the connection, so do it explicitly.
        conn.close()
 def get_data():
    """Return all stored articles.

    Returns:
        A list of ``(title, text, link)`` tuples, one per row of
        ``vectorsvevijesti``.

    Raises:
        psycopg2.Error: If connecting or querying fails; the connection is
            closed before the error propagates.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
            return cursor.fetchall()
    finally:
        # Release the connection on success and on failure alike.
        conn.close()
 def create_db(drop_existing=True):
    """Create the pgvector extension and the ``vectorsvevijesti`` table.

    Args:
        drop_existing: When True (the default, matching the previous
            behaviour) an existing table is dropped first, discarding all
            stored rows. Pass False to only create the table if it is missing.

    Raises:
        psycopg2.Error: If connecting or any statement fails; the transaction
            is rolled back and the connection closed before the error
            propagates.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        # ``with conn`` commits on success and rolls back on an exception.
        with conn, conn.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
            # The extension has to exist before its type can be registered.
            register_vector(conn)
            if drop_existing:
                cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
            cursor.execute(
                'CREATE TABLE IF NOT EXISTS vectorsvevijesti ('
                ' id bigserial PRIMARY KEY,'
                ' title VARCHAR,'
                ' text VARCHAR,'
                ' link VARCHAR,'
                ' embedding vector(1536)'
                ' );'
            )
    finally:
        # ``with conn`` does not close the connection, so do it explicitly.
        conn.close()
 # NOTE(review): executed at import time. create_db() drops and recreates the
 # vectorsvevijesti table, so every import of this module (including the
 # import in scrapingsingle.py) discards all previously stored rows.
 create_db()