# old-pitaj-ramizu/backend/add_events.py

import os
from dotenv import load_dotenv
import openai
import redis
import random
import requests
from bs4 import BeautifulSoup

# Populate os.environ from a local .env file (if one exists) before the
# lookups below; without this call, values that are defined only in .env are
# invisible to os.getenv. By default load_dotenv does not override variables
# that are already set in the real environment.
load_dotenv()

# Credentials and connection settings come from the environment.
openai.api_key = os.getenv("OPENAI_API_KEY")
redis_url = os.getenv("REDIS_URL")
# Module-wide Redis client used by add_current_events to cache the titles.
redis_client = redis.from_url(redis_url)

def add_current_events():
    """Scrape current headlines, cache them in Redis and return a random sample.

    Headings are collected from the klix.ba and avaz.ba front pages with
    ``extract_titles``, reduced with ``filter_unique_titles``, joined with
    newlines and stored under the ``todays_events`` key with a 10-minute
    expiry.

    Returns:
        A newline-separated string of up to 7 randomly chosen titles. Fewer
        are returned when fewer are available, and an empty string when no
        titles were found.
    """
    urls = ['https://www.klix.ba', 'https://www.avaz.ba']
    titles = extract_titles(urls)
    unique_titles = filter_unique_titles(titles)

    # Convert the unique titles list to a string separated by newline
    todays_events_str = "\n".join(unique_titles)
    # Save the result to Redis with a 10-minute expiration time
    redis_client.set('todays_events', todays_events_str, ex=600)

    # Sample from the stored lines; "".split("\n") yields [""], so blank
    # entries are dropped to keep an empty result truly empty.
    candidates = [line for line in todays_events_str.split("\n") if line]
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of available titles.
    sample_size = min(7, len(candidates))
    return "\n".join(random.sample(candidates, sample_size))


def extract_titles(urls):
    """Collect the text of every h1/h2/h3 heading found on the given pages.

    Args:
        urls: Iterable of page URLs to download.

    Returns:
        A list of whitespace-stripped heading texts. Pages are processed in
        the order given; within a page all ``h1`` texts come first, then
        ``h2``, then ``h3``. Headings with no text are skipped. A page that
        cannot be fetched or parsed is skipped after its error is printed.
    """
    titles = []

    # Set the User-Agent to Chrome on Windows
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    for url in urls:
        # Best-effort scraping: one failing site must not prevent the others
        # from being processed, so any error is reported and the loop moves on.
        try:
            # The timeout keeps a stalled server from blocking the job forever.
            response = requests.get(url, headers=request_headers, timeout=10)
            # Do not treat the headings of an HTTP error page as news titles.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            for tag in ('h1', 'h2', 'h3'):
                # Uses its own name so the request headers dict above is not
                # overwritten and stays valid for the next URL.
                for heading in soup.find_all(tag):
                    text = heading.text.strip()
                    if text:
                        titles.append(text)
        except Exception as e:
            print(f"Error processing URL {url}: {e}")

    return titles

def filter_out_titles_with_duplicate_meanings(titles):
    """Return the titles with exact duplicates removed, keeping first occurrences.

    Despite the name, no semantic comparison is done: two titles count as
    duplicates only when they compare equal.

    Args:
        titles: Iterable of title strings.

    Returns:
        A new list holding each distinct title once, in the order it first
        appeared in ``titles``.
    """
    # dict keys are unique and keep insertion order, which gives an
    # order-preserving dedupe in linear time instead of a list scan per title.
    return list(dict.fromkeys(titles))


def filter_unique_titles(titles):
    """Ask the OpenAI completion model to reduce titles to unique topics.

    The titles are sent as a "- " bulleted list and the completion text is
    split into lines. The prompt and the raw split reply are printed.

    Args:
        titles: List of title strings to filter.

    Returns:
        A list of non-empty, stripped lines from the model's reply, with a
        leading "- " bullet removed from each line that has one. Returns an
        empty list without calling the API when ``titles`` is empty.
    """
    # Nothing to filter: skip the API call instead of sending a prompt with
    # no titles in it.
    if not titles:
        return []

    # Prepare the prompt
    bullet_lines = "".join(f"- {title}\n" for title in titles)
    prompt = (
        "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
        + bullet_lines
        + "Filtered unique titles:\n"
    )

    print(prompt)
    # Call the GPT API
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # Extract the filtered titles
    filtered_titles = response.choices[0].text.strip().split("\n")

    print(filtered_titles)

    # Clean up and return the titles
    cleaned_titles = []
    for line in filtered_titles:
        line = line.strip()
        # The prompt lists titles as "- ..." bullets, so the reply presumably
        # mirrors that format; drop the bullet so only the title text remains.
        if line.startswith("- "):
            line = line[2:].strip()
        if line:
            cleaned_titles.append(line)
    return cleaned_titles

# Script entry point: when this file is run directly (not imported), scrape the
# headlines once and store them in Redis via add_current_events.
if __name__ == '__main__':
    add_current_events()