From 8a90dc9d62a1d5fe31ec4c390ac11db923c1282f Mon Sep 17 00:00:00 2001
From: Senad Uka
Date: Mon, 15 May 2023 17:15:30 +0200
Subject: [PATCH] Add add events script

---
Review notes (git am ignores the text between the "---" line and the diff):

- The line structure of this patch was reconstructed. Indentation and blank
  lines follow standard Python layout, so applying it may need
  `git apply --ignore-whitespace`.
- add_events.py: call load_dotenv() (it was imported but never called); stop
  rebinding `headers` inside extract_titles(), which replaced the request
  headers used for the second URL; add a request timeout; cap the
  random.sample() size so fewer than 7 titles no longer raises ValueError.
- app.py: add the missing comma after the ngrok origin. Without it Python
  concatenates that literal with "https://pitajramizu.com" into one string.
- The commit id on the first line and the blob ids on the "index" lines are
  those of the original commit and do not reflect the amendments above.

 backend/add_events.py | 101 ++++++++++++++++++++++++++++++++++++++++++
 backend/app.py        |  86 ++----------------------------------
 2 files changed, 105 insertions(+), 82 deletions(-)
 create mode 100644 backend/add_events.py

diff --git a/backend/add_events.py b/backend/add_events.py
new file mode 100644
index 0000000..49964c2
--- /dev/null
+++ b/backend/add_events.py
@@ -0,0 +1,101 @@
+import os
+from dotenv import load_dotenv
+import openai
+import redis
+import random
+import requests
+from bs4 import BeautifulSoup
+
+# Read OPENAI_API_KEY / REDIS_URL from a local .env file, if one exists.
+load_dotenv()
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+redis_url = os.getenv("REDIS_URL")
+redis_client = redis.from_url(redis_url)
+
+
+def add_current_events():
+    """Scrape headlines, reduce them to unique topics and cache them in Redis.
+
+    Returns a newline-separated string of up to 7 randomly chosen titles.
+    """
+    urls = ['https://www.klix.ba', 'https://www.avaz.ba']
+    titles = extract_titles(urls)
+    unique_titles = filter_unique_titles(titles)
+    # Convert the unique titles list to a string separated by newline
+    todays_events_str = "\n".join(unique_titles)
+    # Save the result to Redis with a 10-minute expiration time
+    redis_client.set('todays_events', todays_events_str, ex=600)
+    # random.sample raises ValueError when asked for more items than exist,
+    # so cap the sample size at the number of titles actually available.
+    sample_size = min(7, len(unique_titles))
+    return "\n".join(random.sample(unique_titles, sample_size))
+
+
+def extract_titles(urls):
+    """Return the stripped text of every h1/h2/h3 element found at each URL."""
+    titles = []
+
+    # Set the User-Agent to Chrome on Windows
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    for url in urls:
+        try:
+            # A timeout keeps one unresponsive site from hanging the script.
+            response = requests.get(url, headers=headers, timeout=10)
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            for tag in ['h1', 'h2', 'h3']:
+                # Keep `headers` bound to the request headers: rebinding it
+                # here would corrupt the request made for the next URL.
+                for heading in soup.find_all(tag):
+                    titles.append(heading.text.strip())
+        except Exception as e:
+            print(f"Error processing URL {url}: {e}")
+
+    return titles
+
+
+def filter_out_titles_with_duplicate_meanings(titles):
+    """Return titles with exact duplicates removed, keeping first-seen order."""
+    filtered_titles = []
+
+    for title in titles:
+        if title not in filtered_titles:
+            filtered_titles.append(title)
+
+    return filtered_titles
+
+
+def filter_unique_titles(titles):
+    """Ask the OpenAI completion API to filter titles down to unique topics."""
+    # Prepare the prompt
+    prompt = "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
+    for title in titles:
+        prompt += f"- {title}\n"
+
+    prompt += "Filtered unique titles:\n"
+
+    print(prompt)
+    # Call the GPT API
+    response = openai.Completion.create(
+        engine="text-davinci-002",
+        prompt=prompt,
+        max_tokens=1000,
+        n=1,
+        stop=None,
+        temperature=0.7,
+    )
+
+    # Extract the filtered titles
+    filtered_titles = response.choices[0].text.strip().split("\n")
+
+    print(filtered_titles)
+    # Clean up and return the titles
+    return [title.strip() for title in filtered_titles if title.strip()]
+
+
+if __name__ == '__main__':
+    add_current_events()
\ No newline at end of file
diff --git a/backend/app.py b/backend/app.py
index 9ac5eea..764c1e0 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -22,6 +22,8 @@ cors = CORS(app, resources={
             "http://pitajramizu.com",
             "http://www.pitajramizu.com",
-            "https://c50a-77-77-231-127.ngrok-free.app"
+            "https://c50a-77-77-231-127.ngrok-free.app",
+            "https://pitajramizu.com",
+            "https://www.pitajramizu.com",
         ]
     }
 })
@@ -98,70 +100,6 @@ def chat():
 
 
 
-
-def extract_titles(urls):
-    titles = []
-
-    # Set the User-Agent to Chrome on Windows
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
-    }
-
-    for url in urls:
-        try:
-            response = requests.get(url, headers=headers)
-            soup = BeautifulSoup(response.content, 'html.parser')
-
-            for tag in ['h1', 'h2', 'h3']:
-                headers = soup.find_all(tag)
-
-                for header in headers:
-                    titles.append(header.text.strip())
-        except Exception as e:
-            print(f"Error processing URL {url}: {e}")
-
-    return titles
-
-def filter_out_titles_with_duplicate_meanings(titles):
-    filtered_titles = []
-
-    for title in titles:
-        if title not in filtered_titles:
-            filtered_titles.append(title)
-
-    return filtered_titles
-
-
-def filter_unique_titles(titles):
-    # Prepare the prompt
-    prompt = "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
-    for title in titles:
-        prompt += f"- {title}\n"
-
-    prompt += "Filtered unique titles:\n"
-
-
-    print(prompt)
-    # Call the GPT API
-    response = openai.Completion.create(
-        engine="text-davinci-002",
-        prompt=prompt,
-        max_tokens=1000,
-        n=1,
-        stop=None,
-        temperature=0.7,
-    )
-
-
-
-    # Extract the filtered titles
-    filtered_titles = response.choices[0].text.strip().split("\n")
-
-    print(filtered_titles)
-    # Clean up and return the titles
-    return [title.strip() for title in filtered_titles if title.strip()]
-
-
 def get_todays_events():
     # Check if the 'todays_events' key exists
     todays_events = redis_client.get('todays_events')
@@ -170,23 +108,7 @@ def get_todays_events():
         # If the key exists, return its value
         return todays_events.decode('utf-8')
     else:
-        # If the key doesn't exist, extract titles from the URLs and filter unique titles
-        urls = ['https://www.klix.ba', 'https://www.avaz.ba']
-        titles = extract_titles(urls)
-        unique_titles = filter_unique_titles(titles)
-
-        # Convert the unique titles list to a string separated by newline
-        todays_events_str = "\n".join(unique_titles)
-
-        # Save the result to Redis with a 10-minute expiration time
-        redis_client.set('todays_events', todays_events_str, ex=600)
-
-        # return the result but split by newline, and then choose 7 random titles, and merge again into newline separated string
-        return "\n".join(random.sample(todays_events_str.split("\n"), 7))
-
-
-
-
+        return ""
 
 
 if __name__ == '__main__':