SingleArticle test
This commit is contained in:
27
pyth/avaz.py
27
pyth/avaz.py
@@ -1,27 +0,0 @@
|
|||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
def _text_or_empty(tag):
    """Return the stripped text of a parsed tag, or '' when the tag is None."""
    return tag.text.strip() if tag is not None else ''


def getNews(url, timeout=10):
    """Fetch a news listing page and print every article found on it.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        None. Results (or the HTTP error status) are written to stdout.

    Raises:
        requests.exceptions.RequestException: on connection failures or
            timeouts; these are not caught here.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article', class_='news__item')

    for index, article in enumerate(articles, start=1):
        # find() returns None when the element is missing; fall back to an
        # empty string so one incomplete article does not abort the listing.
        title = _text_or_empty(article.find('h2'))
        content = _text_or_empty(article.find('p'))
        category = _text_or_empty(article.find('span'))

        print(f"{index}. Title: {title}")
        print(f" Content: {content}")
        print(f" Category: {category}")
        print('****************************')
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: scrape the "all news" listing page.
    getNews('https://srpskainfo.com/sve-vijesti/')
|
|
||||||
@@ -1,74 +0,0 @@
|
|||||||
import psycopg2
|
|
||||||
from openai import OpenAI
|
|
||||||
from datetime import datetime, timedelta, timezone
|
|
||||||
|
|
||||||
|
|
||||||
import os

import spacy

# SECURITY: credentials must never be committed to source control. The API key
# and database password that used to be hard-coded here should be treated as
# compromised and rotated.
# The OpenAI client is given the key from the OPENAI_API_KEY environment
# variable.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

# spaCy pipeline used by convert_text_to_vector().
nlp = spacy.load("en_core_web_sm")

# Cut-off timestamp (timezone-aware, UTC) used by the article query.
twenty_minutes_ago_utc = datetime.now(timezone.utc) - timedelta(minutes=20)

# Connection settings for the articles database. The password comes from the
# standard libpq PGPASSWORD environment variable instead of the source file.
# NOTE(review): when PGPASSWORD is unset this passes password=None, which
# psycopg2 is expected to drop so libpq's own defaults apply -- confirm.
db_params = {
    'host': 'localhost',
    'port': '5432',
    'database': 'svevijestiweb',
    'user': 'svevijesti',
    'password': os.environ.get('PGPASSWORD'),
}

conn = psycopg2.connect(**db_params)
cursor = conn.cursor()
|
|
||||||
|
|
||||||
def convert_text_to_vector(text):
    """Run *text* through the module-level spaCy pipeline and return its vector."""
    document = nlp(text)
    return document.vector
|
|
||||||
|
|
||||||
def check_similarity_with_gpt3(text1, text2):
    """Ask the chat model how similar two texts are.

    The system prompt instructs the model to answer with a bare number in
    [0, 1] so that the reply can be compared against a numeric threshold.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The model's reply as a string (expected to be a number such as
        ``"0.85"``); an empty string if the model returned no content.
    """
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a text similarity assistant. "
                    "Reply with only a single number between 0 and 1, "
                    "where 1 means the two texts have the same meaning."
                ),
            },
            {
                "role": "user",
                "content": f"Compare the similarity between the following two texts:\n\nText 1: {text1}\nText 2: {text2}\n\nSimilarity:",
            },
        ],
    )
    # message.content may be None; normalise to a string for the caller.
    similarity_score = (completion.choices[0].message.content or "").strip()
    print("Analiza")
    return similarity_score
|
|
||||||
|
|
||||||
def _parse_similarity_score(raw):
    """Convert a model reply to a float in [0, 1]; return None if it is not one.

    Parsing is deliberately strict: pulling "the first number" out of a
    free-form sentence (e.g. the 1 in "Text 1 ...") could trigger a wrong
    deletion, so anything that is not a bare number is rejected.
    """
    try:
        score = float(str(raw).strip())
    except ValueError:
        return None
    return score if 0.0 <= score <= 1.0 else None


# Pairs scoring above this value are treated as duplicates.
similarity_threshold = 0.8

try:
    # NOTE(review): this selects articles OLDER than the cut-off, i.e. nearly
    # the whole table, and compares every pair -- confirm that "<" (rather
    # than ">") is intended.
    cursor.execute(
        "SELECT title FROM articles WHERE articles.created_at < %s",
        (twenty_minutes_ago_utc,),
    )
    data_from_database = cursor.fetchall()
    titles = [row[0] for row in data_from_database]

    # Indices of titles already removed, so they are not compared again.
    deleted = set()

    for i, text1 in enumerate(titles):
        if i in deleted:
            continue
        for j in range(i + 1, len(titles)):
            if j in deleted:
                continue
            text2 = titles[j]

            # Send the raw titles to the model: check_similarity_with_gpt3
            # builds a text prompt, so passing spaCy vectors (as before) only
            # embedded an array repr in the prompt.
            reply = check_similarity_with_gpt3(text1, text2)
            print(reply)
            print("T1", text1)
            print("T2", text2)

            # The reply is a string; comparing it to a float directly raises
            # TypeError, so parse it first and skip unusable replies.
            similarity_score = _parse_similarity_score(reply)
            if similarity_score is None or similarity_score <= similarity_threshold:
                continue

            try:
                # The selected values are titles, so filter on the title
                # column (the old filter compared them with "content").
                # NOTE(review): identical titles share this predicate, so both
                # rows are removed; deleting by primary key would be safer if
                # the table has one.
                cursor.execute("DELETE FROM articles WHERE title = %s", (text2,))
                conn.commit()
                deleted.add(j)
                print(f"Deleted rows where title is {text2}")
            except psycopg2.Error as e:
                conn.rollback()  # Roll back changes if an error occurs
                print(f"Error deleting rows: {e}")
finally:
    # Release database resources even if a query or API call fails.
    cursor.close()
    conn.close()
|
|
||||||
@@ -1,34 +0,0 @@
|
|||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
def _text_or_empty(tag):
    """Return the stripped text of a parsed tag, or '' when the tag is None."""
    return tag.text.strip() if tag is not None else ''


def getNews(url, timeout=10):
    """Fetch a news listing page and print each article with its link.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        None. Results (or the HTTP error status) are written to stdout.

    Raises:
        requests.exceptions.RequestException: on connection failures or
            timeouts; these are not caught here.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Error. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article', class_='news__item')

    for index, article in enumerate(articles, start=1):
        # find() returns None when the element is missing; fall back to an
        # empty string so one incomplete article does not abort the listing.
        title = _text_or_empty(article.find('h2'))
        content = _text_or_empty(article.find('p'))
        category = _text_or_empty(article.find('span'))

        # First anchor inside the article; empty string when there is no
        # anchor or it has no href attribute.
        anchor = article.find('a')
        slink = anchor.get('href', '') if anchor else ''

        print(f"{index}. Title: {title}")
        print(f" Content: {content}")
        print(f" Category: {category}")
        print(f"Link: {slink}")
        print('****************************')
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: scrape the "all news" listing page.
    getNews('https://srpskainfo.com/sve-vijesti/')
|
|
||||||
Reference in New Issue
Block a user