SingleArticle test

This commit is contained in:
2023-12-19 14:26:12 +01:00
parent e052a0822c
commit 3359cc1ca9
3 changed files with 0 additions and 135 deletions

View File

@@ -1,27 +0,0 @@
import requests
from bs4 import BeautifulSoup
def getNews(url, timeout=10):
    """Download a news listing page and print every article found on it.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server; passed straight to
            ``requests.get``.

    For each ``<article class="news__item">`` element the title (``h2``),
    content (``p``) and category (``span``) are printed as a numbered entry.
    A non-200 response prints an error line instead. Returns ``None``.

    Exceptions raised by ``requests.get`` (connection failure, timeout) are
    not caught here and propagate to the caller.
    """
    def text_of(parent, tag_name):
        # find() returns None when the tag is absent; fall back to '' so one
        # incomplete article does not abort the whole listing.
        node = parent.find(tag_name)
        return node.text.strip() if node is not None else ''

    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Error. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article', class_='news__item')
    for index, article in enumerate(articles, start=1):
        title = text_of(article, 'h2')
        content = text_of(article, 'p')
        category = text_of(article, 'span')
        print(f"{index}. Title: {title}")
        print(f" Content: {content}")
        print(f" Category: {category}")
        print('****************************')
if __name__ == "__main__":
    # Script entry point: print the srpskainfo.com "all news" listing.
    getNews('https://srpskainfo.com/sve-vijesti/')

View File

@@ -1,74 +0,0 @@
import os
from datetime import datetime, timedelta, timezone

import psycopg2
import spacy
from openai import OpenAI

# Secrets are read from the environment instead of being committed to the
# repository. A missing variable raises KeyError at start-up, before any
# network or database work is attempted.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# spaCy pipeline used by convert_text_to_vector().
nlp = spacy.load("en_core_web_sm")

# Cut-off timestamp (timezone-aware, UTC) used to select articles.
twenty_minutes_ago_utc = datetime.now(timezone.utc) - timedelta(minutes=20)

# PostgreSQL connection settings; the password comes from PGPASSWORD.
db_params = {
    'host': 'localhost',
    'port': '5432',
    'database': 'svevijestiweb',
    'user': 'svevijesti',
    'password': os.environ["PGPASSWORD"],
}
conn = psycopg2.connect(**db_params)
cursor = conn.cursor()
def convert_text_to_vector(text):
    """Return the spaCy document vector for *text*.

    The text is run through the module-level ``nlp`` pipeline and the
    resulting document's ``vector`` attribute is handed back unchanged.
    """
    doc = nlp(text)
    return doc.vector
def check_similarity_with_gpt3(text1, text2):
    """Ask gpt-3.5-turbo how similar two texts are and return a numeric score.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        A float between 0.0 and 1.0. When the model's reply contains no
        number, or a number outside that range, 0.0 is returned so that a
        caller comparing against a threshold treats the pair as "not similar".

    The model is told to answer with a bare number because the caller compares
    the result with a float threshold; free-form prose cannot be compared.
    """
    import re  # local import: only this function needs it

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a text similarity assistant. "
                    "Reply with a single number between 0 and 1 and nothing else."
                ),
            },
            {"role": "user", "content": f"Compare the similarity between the following two texts:\n\nText 1: {text1}\nText 2: {text2}\n\nSimilarity:"},
        ],
        # Keep answers as repeatable as the API allows.
        temperature=0,
    )
    reply = completion.choices[0].message.content or ""
    print("Analiza")

    # Pull the first number out of the reply; the model may still wrap it in
    # a few words despite the instruction.
    found = re.search(r"-?(?:\d*\.\d+|\d+)", reply)
    if found is None:
        return 0.0
    score = float(found.group())
    # A value outside 0..1 means the reply used some other scale; do not guess.
    if not 0.0 <= score <= 1.0:
        return 0.0
    return score
# Duplicate sweep: compare every pair of article titles and delete the later
# title of each pair whose similarity score exceeds the threshold.
similarity_threshold = 0.8  # loop-invariant, so it is defined once up front

try:
    # Titles of articles created more than twenty minutes ago.
    cursor.execute("SELECT title FROM articles WHERE articles.created_at < %s", (twenty_minutes_ago_utc,))
    data_from_database = cursor.fetchall()

    # Work on distinct, non-empty titles. Rows are deleted by title below, so
    # two rows sharing a title cannot be told apart here, and comparing them
    # with each other would delete both copies.
    # NOTE(review): deleting by a primary key would be more precise; the
    # schema is not visible from this script.
    titles = list(dict.fromkeys(row[0] for row in data_from_database if row[0]))
    deleted_titles = set()

    for i, text1 in enumerate(titles):
        if text1 in deleted_titles:
            continue
        for text2 in titles[i + 1:]:
            if text2 in deleted_titles:
                continue

            # The prompt expects text, so the titles themselves are sent
            # rather than their spaCy vectors.
            similarity_score = check_similarity_with_gpt3(text1, text2)
            print(similarity_score)
            print("T1", text1)
            print("T2", text2)

            # The score must be numeric before it can be compared with the
            # threshold; anything else is skipped instead of raising.
            try:
                score = float(similarity_score)
            except (TypeError, ValueError):
                print(f"Skipping pair, similarity score is not numeric: {similarity_score!r}")
                continue
            if score <= similarity_threshold:
                continue

            try:
                # text2 is a title, so the filter is on the title column.
                cursor.execute("DELETE FROM articles WHERE title = %s", (text2,))
                conn.commit()
            except psycopg2.Error as e:
                conn.rollback()  # Roll back changes if an error occurs
                print(f"Error deleting rows: {e}")
            else:
                deleted_titles.add(text2)
                print(f"Deleted rows where title is {text2}")
finally:
    # Release the database resources even when the sweep fails part-way.
    cursor.close()
    conn.close()

View File

@@ -1,34 +0,0 @@
import requests
from bs4 import BeautifulSoup
def getNews(url, timeout=10):
    """Download a news listing page and print every article with its link.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server; passed straight to
            ``requests.get``.

    For each ``<article class="news__item">`` element the title (``h2``),
    content (``p``), category (``span``) and the ``href`` of the first ``a``
    are printed as a numbered entry. A non-200 response prints an error line
    instead. Returns ``None``.

    Exceptions raised by ``requests.get`` (connection failure, timeout) are
    not caught here and propagate to the caller.
    """
    def text_of(parent, tag_name):
        # find() returns None when the tag is absent; fall back to '' so a
        # missing heading is handled the same way as a missing link.
        node = parent.find(tag_name)
        return node.text.strip() if node is not None else ''

    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Error. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article', class_='news__item')
    for index, article in enumerate(articles, start=1):
        title = text_of(article, 'h2')
        content = text_of(article, 'p')
        category = text_of(article, 'span')
        anchor = article.find('a')
        slink = anchor.get('href', '') if anchor is not None else ''
        print(f"{index}. Title: {title}")
        print(f" Content: {content}")
        print(f" Category: {category}")
        print(f"Link: {slink}")
        print('****************************')
if __name__ == "__main__":
    # Script entry point: print the srpskainfo.com "all news" listing.
    getNews('https://srpskainfo.com/sve-vijesti/')