SingleArticle test
This commit is contained in:
27
pyth/avaz.py
27
pyth/avaz.py
@@ -1,27 +0,0 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def _tag_text(parent, tag_name):
    """Return the stripped text of the first *tag_name* under *parent*, or ''."""
    tag = parent.find(tag_name)
    return tag.text.strip() if tag is not None else ''


def getNews(url, timeout=10):
    """Download a news listing page and print each article found on it.

    Every ``<article class="news__item">`` element is printed with its
    title (first ``<h2>``), content (first ``<p>``) and category (first
    ``<span>``). A missing sub-element is printed as an empty string
    instead of aborting the whole listing.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article', class_='news__item')

    for index, article in enumerate(articles, start=1):
        title = _tag_text(article, 'h2')
        content = _tag_text(article, 'p')
        category = _tag_text(article, 'span')

        print(f"{index}. Title: {title}")
        print(f" Content: {content}")
        print(f" Category: {category}")
        print('****************************')
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: print the articles from the srpskainfo.com listing page.
    getNews('https://srpskainfo.com/sve-vijesti/')
|
||||
@@ -1,74 +0,0 @@
|
||||
import psycopg2
|
||||
from openai import OpenAI
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
|
||||
import os

import spacy

# Secrets are read from the environment so they never live in the source
# tree. NOTE(review): the API key and database password that were committed
# here earlier must be treated as compromised and rotated.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# spaCy pipeline used by convert_text_to_vector.
nlp = spacy.load("en_core_web_sm")

# Timezone-aware cut-off: twenty minutes before this script started.
twenty_minutes_ago_utc = datetime.now(timezone.utc) - timedelta(minutes=20)

db_params = {
    'host': 'localhost',
    'port': '5432',
    'database': 'svevijestiweb',
    'user': 'svevijesti',
    # None when PGPASSWORD is unset; psycopg2 drops None-valued keywords.
    'password': os.environ.get('PGPASSWORD'),
}

conn = psycopg2.connect(**db_params)
cursor = conn.cursor()
|
||||
|
||||
def convert_text_to_vector(text):
    """Run *text* through the module-level spaCy pipeline and return its ``vector``."""
    document = nlp(text)
    return document.vector
|
||||
|
||||
def check_similarity_with_gpt3(text1, text2):
    """Ask the chat model how similar two texts are.

    The system prompt asks the model to answer with a bare number between
    0 and 1, because the caller compares the reply against a numeric
    threshold; a free-form sentence cannot be compared that way.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The model's reply as a string with surrounding whitespace removed.
        The caller is responsible for converting it to a number.
    """
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        # Temperature 0 keeps the score as repeatable as the API allows.
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a text similarity assistant. "
                    "Reply with a single number between 0 and 1 and nothing else."
                ),
            },
            {
                "role": "user",
                "content": f"Compare the similarity between the following two texts:\n\nText 1: {text1}\nText 2: {text2}\n\nSimilarity:",
            },
        ],
    )
    similarity_score = (completion.choices[0].message.content or "").strip()
    print("Analiza")
    return similarity_score
|
||||
|
||||
# Remove near-duplicate articles: compare every pair of titles created before
# the cut-off and delete the second title of any pair scored above the threshold.
similarity_threshold = 0.8

try:
    cursor.execute("SELECT title FROM articles WHERE articles.created_at < %s", (twenty_minutes_ago_utc,))
    data_from_database = cursor.fetchall()

    # Collapse identical titles to one entry (order preserved) so a title is
    # never compared with itself: the DELETE below matches on title and would
    # otherwise remove every copy.
    # NOTE(review): exact-duplicate rows are therefore left alone; removing
    # them safely needs a primary key, which this query does not select.
    titles = list(dict.fromkeys(row[0] for row in data_from_database))
    deleted_titles = set()

    for i, text1 in enumerate(titles):
        if text1 in deleted_titles:
            continue

        for text2 in titles[i + 1:]:
            if text2 in deleted_titles:
                continue

            # Send the texts themselves; the prompt is built from plain strings.
            raw_score = check_similarity_with_gpt3(text1, text2)
            print(raw_score)
            print("T1", text1)
            print("T2", text2)

            # The model reply is text; it must be numeric to compare with the threshold.
            try:
                similarity_score = float(raw_score)
            except (TypeError, ValueError):
                print(f"Skipping pair, non-numeric similarity: {raw_score!r}")
                continue

            if similarity_score > similarity_threshold:
                try:
                    # text2 is a title, so the filter is on the title column.
                    # NOTE(review): this was `content = %s` before; confirm the column.
                    cursor.execute("DELETE FROM articles WHERE title = %s", (text2,))
                    conn.commit()
                except psycopg2.Error as e:
                    conn.rollback()  # Roll back changes if an error occurs
                    print(f"Error deleting rows: {e}")
                else:
                    deleted_titles.add(text2)
                    print(f"Deleted rows where title is {text2}")
finally:
    # Release the database resources even if a comparison or query fails.
    cursor.close()
    conn.close()
|
||||
@@ -1,34 +0,0 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def _tag_text(parent, tag_name):
    """Return the stripped text of the first *tag_name* under *parent*, or ''."""
    tag = parent.find(tag_name)
    return tag.text.strip() if tag is not None else ''


def getNews(url, timeout=10):
    """Download a news listing page and print each article found on it.

    Every ``<article class="news__item">`` element is printed with its
    title (first ``<h2>``), content (first ``<p>``), category (first
    ``<span>``) and the ``href`` of its first ``<a>``. A missing
    sub-element is printed as an empty string instead of aborting the
    whole listing.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article', class_='news__item')

    for index, article in enumerate(articles, start=1):
        title = _tag_text(article, 'h2')
        content = _tag_text(article, 'p')
        category = _tag_text(article, 'span')

        anchor = article.find('a')
        slink = anchor.get('href', '') if anchor is not None else ''

        print(f"{index}. Title: {title}")
        print(f" Content: {content}")
        print(f" Category: {category}")
        print(f"Link: {slink}")
        print('****************************')
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: print the articles from the srpskainfo.com listing page.
    getNews('https://srpskainfo.com/sve-vijesti/')
|
||||
Reference in New Issue
Block a user