"""Scrape current headlines from news sites, de-duplicate them, cache them in Redis.

The flow is: fetch the front pages, collect the text of every h1/h2/h3
heading, ask an OpenAI completion model to keep one title per topic, store
the result in Redis, and hand back a small random sample.
"""

import os
import random
from typing import List

import openai
import redis
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# load_dotenv was imported but never called, so values in a local .env file
# were never visible to os.getenv below.  By default it does not override
# variables that are already set in the real environment.
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")
redis_url = os.getenv("REDIS_URL")
redis_client = redis.from_url(redis_url)

NEWS_URLS = ['https://www.klix.ba', 'https://www.avaz.ba']
CACHE_KEY = 'todays_events'
CACHE_TTL_SECONDS = 600  # 10 minutes
SAMPLE_SIZE = 7
REQUEST_TIMEOUT_SECONDS = 10
HEADING_TAGS = ('h1', 'h2', 'h3')


def add_current_events() -> str:
    """Refresh the cached headlines and return a random sample of them.

    Scrapes ``NEWS_URLS``, filters the titles down to unique topics, and
    stores the newline-joined result under ``CACHE_KEY`` in Redis with a
    ``CACHE_TTL_SECONDS`` expiry.  This function always rebuilds the value;
    it does not read the cache itself.

    Returns:
        Up to ``SAMPLE_SIZE`` randomly chosen titles joined by newlines
        (an empty string when no titles were found).
    """
    titles = extract_titles(NEWS_URLS)
    unique_titles = filter_unique_titles(titles)

    todays_events_str = "\n".join(unique_titles)
    redis_client.set(CACHE_KEY, todays_events_str, ex=CACHE_TTL_SECONDS)

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of titles actually available.
    sample_size = min(SAMPLE_SIZE, len(unique_titles))
    return "\n".join(random.sample(unique_titles, sample_size))


def extract_titles(urls) -> List[str]:
    """Collect the text of all h1/h2/h3 headings from each page in *urls*.

    Pages are processed best-effort: a URL that cannot be fetched or parsed
    is reported on stdout and skipped, and the remaining URLs are still
    processed.

    Args:
        urls: Iterable of page URLs to fetch.

    Returns:
        Stripped, non-empty heading texts; per page, all h1 headings come
        first, then h2, then h3.
    """
    titles = []

    # Set the User-Agent to Chrome on Windows
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    for url in urls:
        try:
            # A timeout keeps one unresponsive site from hanging the run.
            response = requests.get(
                url,
                headers=request_headers,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            # Do not harvest headings from HTTP error pages.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            for tag in HEADING_TAGS:
                # Named distinctly from request_headers: reusing one name
                # here previously clobbered the HTTP headers, which broke
                # the request for every URL after the first.
                for heading in soup.find_all(tag):
                    text = heading.text.strip()
                    if text:
                        titles.append(text)
        except Exception as e:  # deliberate best-effort boundary per URL
            print(f"Error processing URL {url}: {e}")

    return titles


def filter_out_titles_with_duplicate_meanings(titles) -> List[str]:
    """Return *titles* with exact duplicates removed, keeping first-seen order.

    Despite the name, only string equality is considered; titles that are
    worded differently but mean the same thing are all kept.
    """
    # dict preserves insertion order, giving an O(n) order-stable dedup.
    return list(dict.fromkeys(titles))


def filter_unique_titles(titles) -> List[str]:
    """Ask the OpenAI completion model to keep one title per topic.

    Args:
        titles: Candidate headline strings.

    Returns:
        The non-empty, stripped lines of the model's reply.  An empty list
        is returned without calling the API when *titles* is empty.
    """
    if not titles:
        # Nothing to filter; avoid a pointless API call whose reply could
        # only be invented content.
        return []

    # Prepare the prompt
    prompt = "".join(
        [
            "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n",
            *(f"- {title}\n" for title in titles),
            "Filtered unique titles:\n",
        ]
    )
    print(prompt)

    # Call the GPT API
    # NOTE(review): this uses the legacy Completion endpoint with the
    # "text-davinci-002" engine; confirm both are still available for the
    # installed openai package and account, and migrate if not.
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # Extract the filtered titles
    filtered_titles = response.choices[0].text.strip().split("\n")
    print(filtered_titles)

    # Clean up and return the titles
    return [title.strip() for title in filtered_titles if title.strip()]


if __name__ == '__main__':
    add_current_events()