Adding VDB

2023-12-25 12:31:55 +01:00
parent 18c8fdee7d
commit 954ae97a96
3 changed files with 202 additions and 32 deletions
--- a/pyth/scrapingsingle.py
+++ b/pyth/scrapingsingle.py
@@ -0,0 +1,87 @@
+from bs4 import BeautifulSoup
+import requests
+from urllib.parse import urljoin
+from openai import OpenAI
+import os
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores.pgvector import PGVector
+from vectData import insert_data ,is_similar_data 
+import json
+
+
+# OpenAI() and OpenAIEmbeddings() are created without explicit credentials, so
+# they rely on the OPENAI_API_KEY environment variable. The key has to be
+# provided by the runtime environment: a secret written into source control
+# must be considered leaked and should be revoked.
+if not os.environ.get("OPENAI_API_KEY"):
+    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
+client = OpenAI()
+embeddings = OpenAIEmbeddings()
+
+# Front pages that are scanned for article links.
+dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
+# HTTP request headers; the User-Agent identifies as a mobile Chrome browser.
+headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
+
+
+# Every article URL gathered from the front pages.
+total_links = set()
+# NOTE(review): not referenced in the visible part of this file - confirm use.
+collected_news = set()
+
+def get_article_links(url, already_checked, timeout=10):
+    """Collect the links found inside ``<article>`` elements of a page.
+
+    Args:
+        url: Address of the page to scan; also used as the base for
+            resolving relative ``href`` values.
+        already_checked: Set of URLs seen so far. Newly found links are
+            added to it, i.e. the set is mutated in place.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        A list of absolute URLs that were not yet in ``already_checked``,
+        in document order. The list is empty when the request fails or the
+        response status is not 200.
+    """
+    try:
+        # ``headers`` must be passed by keyword: the second positional
+        # parameter of ``requests.get`` is ``params`` (the query string),
+        # so a positional dict would never be sent as HTTP headers.
+        response = requests.get(url, headers=headers, timeout=timeout)
+    except requests.RequestException as e:
+        # One unreachable site must not abort the whole crawl.
+        print(f"Error fetching {url}: {e}")
+        return []
+    if response.status_code != 200:
+        return []
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+    link_store = []
+    for article in soup.find_all('article'):
+        for link in article.find_all('a', href=True):
+            link_value = urljoin(url, link['href'])
+            if link_value not in already_checked:
+                link_store.append(link_value)
+                already_checked.add(link_value)
+    return link_store
+
+# URLs already seen on any front page; shared across all get_article_links
+# calls so that a link is collected only once.
+already_checked = set()
+
+for dlink in dlinks:
+    temp_links = get_article_links(dlink, already_checked)
+    if not temp_links:
+        # Nothing returned for this front page (None or an empty list).
+        continue
+    total_links.update(temp_links)
+
+# Keep only truthy entries (drops e.g. empty strings).
+final_links = set(filter(None, total_links))
+
+# For every collected article: download the page, let the chat model reduce it
+# to a title/content JSON object, embed that output and store it unless
+# is_similar_data reports a match.
+for link in final_links:
+    try:
+        # ``headers`` must be passed by keyword; positionally the dict would
+        # be taken as query-string ``params`` and never sent as headers.
+        response = requests.get(link, headers=headers, timeout=10)
+    except requests.RequestException as e:
+        # One unreachable article must not abort the whole run.
+        print(f"Error fetching {link}: {e}")
+        continue
+    if response.status_code != 200:
+        # Same rule as get_article_links: only successful responses are parsed.
+        continue
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # All headings and all paragraphs of the page, each joined into one string.
+    titles = soup.find_all(['h2', 'h1','h3'])
+    title_text = ' '.join([title.get_text(strip=True) for title in titles])
+
+    texts = soup.find_all(['p'])
+    text_text = ' '.join([text.get_text(strip=True) for text in texts])
+
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
+                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
+            ]
+        )
+        generated_text = completion.choices[0].message.content
+
+        # The model is asked for JSON; anything else raises here and the
+        # article is skipped by the handler below.
+        response_data = json.loads(generated_text)
+
+        title = response_data["title"]
+        text = response_data["content"]
+
+        print("*********************************")
+        print(f"Title: {title}")
+        print("---------------------------------")
+        print(f"Content : {text}")
+        print("*********************************")
+
+        # The embedding is computed from the raw model output (the JSON text).
+        vector = embeddings.embed_query(generated_text)
+
+        if not is_similar_data(title, text, link, vector, threshold=0.9):
+            insert_data(title, text, link, vector)
+
+    except Exception as e:
+        # Per-article boundary: report the problem and go on with the next link.
+        print(f"Error in completion: {e}")