Add add_events.py script

2023-05-15 17:15:30 +02:00
parent 4b93527998
commit 8a90dc9d62
2 changed files with 93 additions and 81 deletions
--- a/backend/add_events.py
+++ b/backend/add_events.py
@@ -0,0 +1,90 @@
 import os
 from dotenv import load_dotenv
 import openai
 import redis
 import random
 import requests
 from bs4 import BeautifulSoup
 # Load variables from a local .env file (if one exists) before reading them.
 # load_dotenv was imported but never called, so a standalone run of this
 # script would otherwise see only variables already exported in the shell.
 load_dotenv()
 openai.api_key = os.getenv("OPENAI_API_KEY")
 redis_url = os.getenv("REDIS_URL")
 # Shared Redis client used to store the scraped headlines.
 redis_client = redis.from_url(redis_url)
 def add_current_events():
    """Scrape current headlines, cache them in Redis and return a sample.

    Headings are extracted from the hard-coded news sites, reduced to unique
    topics with ``filter_unique_titles``, joined with newlines and stored
    under the ``todays_events`` key with a 10-minute expiry.

    Returns:
        A newline-separated string of up to 7 randomly chosen titles; fewer
        when fewer titles are available, and an empty string when none are.
    """
    urls = ['https://www.klix.ba', 'https://www.avaz.ba']
    titles = extract_titles(urls)
    unique_titles = filter_unique_titles(titles)
    # Convert the unique titles list to a string separated by newline
    todays_events_str = "\n".join(unique_titles)
    # Save the result to Redis with a 10-minute expiration time
    redis_client.set('todays_events', todays_events_str, ex=600)
    # "".split("\n") yields [''], so treat an empty result as no candidates.
    candidates = todays_events_str.split("\n") if todays_events_str else []
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of titles actually available.
    sample_size = min(7, len(candidates))
    return "\n".join(random.sample(candidates, sample_size))
 def extract_titles(urls):
    """Collect the text of every h1/h2/h3 heading found on the given pages.

    Args:
        urls: Iterable of page URLs to download and parse.

    Returns:
        A list of non-empty, whitespace-stripped heading texts. For each
        page all h1 headings come first, then h2, then h3. A URL that fails
        to download or parse is reported on stdout and skipped.
    """
    titles = []
    # Set the User-Agent to Chrome on Windows
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for url in urls:
        try:
            # The request headers live in their own name: previously the
            # parsed heading list overwrote ``headers``, so every URL after
            # the first was requested with a list of tags as its headers.
            # A timeout keeps one unresponsive site from hanging the run.
            response = requests.get(url, headers=request_headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')
            for tag in ['h1', 'h2', 'h3']:
                for heading in soup.find_all(tag):
                    text = heading.text.strip()
                    # Headings that only wrap images/icons have no text.
                    if text:
                        titles.append(text)
        except Exception as e:
            # Deliberately best-effort: one failing site must not abort the rest.
            print(f"Error processing URL {url}: {e}")
    return titles
 def filter_out_titles_with_duplicate_meanings(titles):
    """Return ``titles`` without exact duplicates, keeping first-seen order.

    Despite the name, only exact equality is checked; titles that merely
    share a meaning are not merged.

    Args:
        titles: Iterable of hashable titles (strings).

    Returns:
        A new list containing each distinct title once.
    """
    # dict preserves insertion order, giving an O(n) order-preserving dedupe
    # instead of the quadratic "scan the result list for every item" loop.
    return list(dict.fromkeys(titles))
 def filter_unique_titles(titles):
    """Ask the OpenAI completion API to reduce ``titles`` to unique topics.

    Args:
        titles: List of headline strings.

    Returns:
        A list of non-empty, stripped lines from the model's completion.
        An empty ``titles`` list returns ``[]`` without calling the API.
    """
    # With nothing to filter, a completion call would only cost money and
    # could return invented titles, so short-circuit.
    if not titles:
        return []
    # Prepare the prompt
    prompt = (
        "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
        + "".join(f"- {title}\n" for title in titles)
        + "Filtered unique titles:\n"
    )
    print(prompt)
    # Call the GPT API
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )
    # Extract the filtered titles
    # NOTE(review): the completion may echo the "- " bullet prefix used in
    # the prompt; confirm whether consumers expect it to be stripped.
    filtered_titles = response.choices[0].text.strip().split("\n")
    print(filtered_titles)
    # Clean up and return the titles
    stripped_titles = (title.strip() for title in filtered_titles)
    return [title for title in stripped_titles if title]
 # When executed as a script, scrape the headlines and store them in Redis once.
 if __name__ == '__main__':
    add_current_events()
--- a/backend/app.py
+++ b/backend/app.py
@@ -22,6 +22,8 @@ cors = CORS(app, resources={
            "http://pitajramizu.com",
            "http://www.pitajramizu.com",
            # The trailing comma is required: without it Python concatenates
            # this literal with the next one, merging two origins into a
            # single invalid entry so "https://pitajramizu.com" is never allowed.
            "https://c50a-77-77-231-127.ngrok-free.app",
            "https://pitajramizu.com",
            "https://www.pitajramizu.com",
        ]
    }
 })
@@ -98,70 +100,6 @@ def chat():
 def extract_titles(urls):
    """Collect the text of every h1/h2/h3 heading found on the given pages.

    Args:
        urls: Iterable of page URLs to download and parse.

    Returns:
        A list of non-empty, whitespace-stripped heading texts. For each
        page all h1 headings come first, then h2, then h3. A URL that fails
        to download or parse is reported on stdout and skipped.
    """
    titles = []
    # Set the User-Agent to Chrome on Windows
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for url in urls:
        try:
            # The request headers live in their own name: previously the
            # parsed heading list overwrote ``headers``, so every URL after
            # the first was requested with a list of tags as its headers.
            # A timeout keeps one unresponsive site from hanging the caller.
            response = requests.get(url, headers=request_headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')
            for tag in ['h1', 'h2', 'h3']:
                for heading in soup.find_all(tag):
                    text = heading.text.strip()
                    # Headings that only wrap images/icons have no text.
                    if text:
                        titles.append(text)
        except Exception as e:
            # Deliberately best-effort: one failing site must not abort the rest.
            print(f"Error processing URL {url}: {e}")
    return titles
 def filter_out_titles_with_duplicate_meanings(titles):
    """Return ``titles`` without exact duplicates, keeping first-seen order.

    Despite the name, only exact equality is checked; titles that merely
    share a meaning are not merged.

    Args:
        titles: Iterable of hashable titles (strings).

    Returns:
        A new list containing each distinct title once.
    """
    # dict preserves insertion order, giving an O(n) order-preserving dedupe
    # instead of the quadratic "scan the result list for every item" loop.
    return list(dict.fromkeys(titles))
 def filter_unique_titles(titles):
    """Ask the OpenAI completion API to reduce ``titles`` to unique topics.

    Args:
        titles: List of headline strings.

    Returns:
        A list of non-empty, stripped lines from the model's completion.
        An empty ``titles`` list returns ``[]`` without calling the API.
    """
    # With nothing to filter, a completion call would only cost money and
    # could return invented titles, so short-circuit.
    if not titles:
        return []
    # Prepare the prompt
    prompt = (
        "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
        + "".join(f"- {title}\n" for title in titles)
        + "Filtered unique titles:\n"
    )
    print(prompt)
    # Call the GPT API
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )
    # Extract the filtered titles
    # NOTE(review): the completion may echo the "- " bullet prefix used in
    # the prompt; confirm whether consumers expect it to be stripped.
    filtered_titles = response.choices[0].text.strip().split("\n")
    print(filtered_titles)
    # Clean up and return the titles
    stripped_titles = (title.strip() for title in filtered_titles)
    return [title for title in stripped_titles if title]
 def get_todays_events():
    # Check if the 'todays_events' key exists
    todays_events = redis_client.get('todays_events')
@@ -170,23 +108,7 @@ def get_todays_events():
        # If the key exists, return its value
        return todays_events.decode('utf-8')
    else:
-        # If the key doesn't exist, extract titles from the URLs and filter unique titles
+        return ""
        urls = ['https://www.klix.ba', 'https://www.avaz.ba']
        titles = extract_titles(urls)
        unique_titles = filter_unique_titles(titles)
        # Convert the unique titles list to a string separated by newline
        todays_events_str = "\n".join(unique_titles)
        # Save the result to Redis with a 10-minute expiration time
        redis_client.set('todays_events', todays_events_str, ex=600)
        # return the result but split by newline, and then choose 7 random titles, and merge again into newline separated string
        return "\n".join(random.sample(todays_events_str.split("\n"), 7))
 if __name__ == '__main__':