90 lines
2.6 KiB
Python
90 lines
2.6 KiB
Python
import os
|
|
from dotenv import load_dotenv
|
|
import openai
|
|
import redis
|
|
import random
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Populate os.environ from a local .env file, if one is present, so the
# lookups below can see values defined there. By default load_dotenv does
# not override variables that are already set in the environment.
load_dotenv()

# API key used by the openai module for all completion requests.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Shared Redis connection used to cache the scraped headlines.
redis_url = os.getenv("REDIS_URL")
redis_client = redis.from_url(redis_url)
|
|
|
|
def add_current_events():
    """Scrape current headlines, cache them in Redis, and return a sample.

    Headings are pulled from the hard-coded news sites with
    ``extract_titles``, reduced with ``filter_unique_titles``, and stored
    newline-separated under the ``todays_events`` key with a 10-minute
    expiry.

    Returns:
        A newline-separated string of up to 7 randomly chosen titles.
        Fewer are returned when fewer are available.
    """
    urls = ['https://www.klix.ba', 'https://www.avaz.ba']
    titles = extract_titles(urls)
    unique_titles = filter_unique_titles(titles)

    # Store the full list, one title per line, with a 10-minute expiry.
    todays_events_str = "\n".join(unique_titles)
    redis_client.set('todays_events', todays_events_str, ex=600)

    # random.sample raises ValueError when asked for more items than the
    # population holds, so clamp the sample size to what is available.
    candidates = todays_events_str.split("\n")
    sample_size = min(7, len(candidates))
    return "\n".join(random.sample(candidates, sample_size))
|
|
|
|
|
|
|
|
def extract_titles(urls):
    """Collect the text of every h1/h2/h3 heading on the given pages.

    Each URL is fetched and parsed independently. A failure on one URL is
    printed and skipped so the remaining URLs are still processed.

    Args:
        urls: Iterable of page URLs to download and parse.

    Returns:
        A list of stripped, non-empty heading strings. For each URL the
        h1 headings come first, then h2, then h3.
    """
    titles = []

    # Set the User-Agent to Chrome on Windows
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    for url in urls:
        try:
            # A timeout keeps one unresponsive site from hanging the run;
            # requests waits indefinitely when none is given.
            response = requests.get(url, headers=request_headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            for tag in ['h1', 'h2', 'h3']:
                # Use a distinct name here: rebinding the request-headers
                # variable would corrupt the request for the next URL.
                for heading in soup.find_all(tag):
                    text = heading.text.strip()
                    if text:
                        titles.append(text)
        except Exception as e:
            # Deliberately best-effort: report the failure and move on.
            print(f"Error processing URL {url}: {e}")

    return titles
|
|
|
|
def filter_out_titles_with_duplicate_meanings(titles):
    """Return the titles with exact duplicates removed, in first-seen order.

    Despite the name, no semantic comparison is performed: two titles are
    duplicates only when they compare equal.

    Args:
        titles: Iterable of title strings.

    Returns:
        A new list containing the first occurrence of each distinct title.
    """
    # dict keys keep insertion order, giving an O(n) order-preserving
    # de-duplication instead of a repeated linear list scan.
    return list(dict.fromkeys(titles))
|
|
|
|
|
|
def filter_unique_titles(titles):
    """Ask the OpenAI completion API to reduce the titles to unique topics.

    The titles are sent as a bulleted list, and the completion text is
    split into one title per line.

    Args:
        titles: Iterable of title strings to filter.

    Returns:
        A list of non-empty, stripped titles taken from the completion.
        An empty input returns an empty list without calling the API.
    """
    titles = list(titles)

    # With nothing to filter, the prompt would contain no titles at all
    # and the model could answer with invented ones; skip the API call.
    if not titles:
        return []

    # Prepare the prompt: instruction, one bullet per title, then a cue.
    prompt = (
        "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
        + "".join(f"- {title}\n" for title in titles)
        + "Filtered unique titles:\n"
    )

    print(prompt)

    # Call the GPT API
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # Extract the filtered titles, one per completion line
    filtered_titles = response.choices[0].text.strip().split("\n")

    print(filtered_titles)

    # Clean up and return the titles
    cleaned_titles = []
    for line in filtered_titles:
        title = line.strip()
        # The prompt lists titles as "- <title>" and the completion may
        # mirror that format; drop the bullet so only the title remains.
        if title.startswith("- "):
            title = title[2:].strip()
        if title:
            cleaned_titles.append(title)
    return cleaned_titles
|
|
|
|
# Script entry point: scrape, filter, and cache the headlines when this
# file is run directly rather than imported.
if __name__ == '__main__':
    add_current_events()