Add add_events script
This commit is contained in:
90
backend/add_events.py
Normal file
90
backend/add_events.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
import openai
|
||||
import redis
|
||||
import random
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Load variables from a local .env file, if one exists, before they are read
# below. load_dotenv() leaves variables already set in the environment as-is.
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

# NOTE(review): redis.from_url() receives None when REDIS_URL is unset —
# confirm the deployment always provides it.
redis_url = os.getenv("REDIS_URL")
redis_client = redis.from_url(redis_url)
|
||||
|
||||
def add_current_events():
    """Scrape current headlines, cache them in Redis, and return a sample.

    Headings are collected from the news front pages with ``extract_titles``,
    reduced with ``filter_unique_titles``, and stored newline-separated under
    the ``todays_events`` Redis key with a 10-minute expiry.

    Returns:
        A newline-separated string of up to 7 randomly chosen titles. Fewer
        are returned when fewer were found; the string is empty when none
        were found.
    """
    urls = ['https://www.klix.ba', 'https://www.avaz.ba']
    titles = extract_titles(urls)
    unique_titles = filter_unique_titles(titles)

    # Store the titles as a single newline-separated string.
    todays_events_str = "\n".join(unique_titles)
    redis_client.set('todays_events', todays_events_str, ex=600)

    # "".split("\n") yields [""], so treat an empty result as no candidates.
    candidates = todays_events_str.split("\n") if todays_events_str else []

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of available titles.
    sample_size = min(7, len(candidates))
    return "\n".join(random.sample(candidates, sample_size))
|
||||
|
||||
|
||||
|
||||
def extract_titles(urls, timeout=10):
    """Collect the text of every h1/h2/h3 heading on the given pages.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of non-empty, whitespace-stripped heading strings. For each
        URL the h1 headings come first, then h2, then h3. A URL that fails is
        reported on stdout and skipped, so the result may be partial or empty.
    """
    titles = []

    # Send a Chrome-on-Windows User-Agent with every request.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    for url in urls:
        # Best-effort scraping: one failing site must not abort the others.
        try:
            response = requests.get(url, headers=request_headers, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')

            for tag in ['h1', 'h2', 'h3']:
                # Use a name distinct from request_headers: reusing it here
                # would replace the headers dict for every later request.
                for heading in soup.find_all(tag):
                    text = heading.text.strip()
                    if text:
                        titles.append(text)
        except Exception as e:
            print(f"Error processing URL {url}: {e}")

    return titles
|
||||
|
||||
def filter_out_titles_with_duplicate_meanings(titles):
    """Drop exact duplicate titles, keeping the first occurrence of each.

    Despite the name, only titles that compare equal are treated as
    duplicates; no semantic comparison is performed.

    Args:
        titles: Iterable of hashable titles (strings).

    Returns:
        A new list with duplicates removed, in first-seen order.
    """
    # dict preserves insertion order, giving an O(n) order-preserving dedupe.
    return list(dict.fromkeys(titles))
|
||||
|
||||
|
||||
def filter_unique_titles(titles):
    """Ask the OpenAI completion API to reduce titles to unique topics.

    Args:
        titles: List of headline strings.

    Returns:
        The non-empty, whitespace-stripped lines of the model's reply. An
        empty list is returned, without calling the API, when ``titles`` is
        empty.
    """
    # Nothing to filter: skip the API round-trip entirely.
    if not titles:
        return []

    # Build the prompt: instruction, one bullet per title, then the cue line.
    prompt = (
        "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
        + "".join(f"- {title}\n" for title in titles)
        + "Filtered unique titles:\n"
    )

    print(prompt)

    # NOTE(review): text-davinci-002 and the legacy Completion endpoint appear
    # on OpenAI's deprecation list — confirm availability and plan a migration.
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # The reply is expected to hold one title per line.
    filtered_titles = response.choices[0].text.strip().split("\n")

    print(filtered_titles)

    # Drop blank lines and surrounding whitespace.
    return [title.strip() for title in filtered_titles if title.strip()]
|
||||
|
||||
# Refresh the cached events only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    add_current_events()
|
||||
@@ -22,6 +22,8 @@ cors = CORS(app, resources={
|
||||
"http://pitajramizu.com",
|
||||
"http://www.pitajramizu.com",
|
||||
# The trailing comma is required: without it this literal is concatenated
# with the next one into a single invalid origin.
"https://c50a-77-77-231-127.ngrok-free.app",
|
||||
"https://pitajramizu.com",
|
||||
"https://www.pitajramizu.com",
|
||||
]
|
||||
}
|
||||
})
|
||||
@@ -98,70 +100,6 @@ def chat():
|
||||
|
||||
|
||||
|
||||
|
||||
def extract_titles(urls, timeout=10):
    """Collect the text of every h1/h2/h3 heading on the given pages.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of non-empty, whitespace-stripped heading strings. For each
        URL the h1 headings come first, then h2, then h3. A URL that fails is
        reported on stdout and skipped, so the result may be partial or empty.
    """
    titles = []

    # Send a Chrome-on-Windows User-Agent with every request.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    for url in urls:
        # Best-effort scraping: one failing site must not abort the others.
        try:
            response = requests.get(url, headers=request_headers, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')

            for tag in ['h1', 'h2', 'h3']:
                # Use a name distinct from request_headers: reusing it here
                # would replace the headers dict for every later request.
                for heading in soup.find_all(tag):
                    text = heading.text.strip()
                    if text:
                        titles.append(text)
        except Exception as e:
            print(f"Error processing URL {url}: {e}")

    return titles
|
||||
|
||||
def filter_out_titles_with_duplicate_meanings(titles):
    """Drop exact duplicate titles, keeping the first occurrence of each.

    Despite the name, only titles that compare equal are treated as
    duplicates; no semantic comparison is performed.

    Args:
        titles: Iterable of hashable titles (strings).

    Returns:
        A new list with duplicates removed, in first-seen order.
    """
    # dict preserves insertion order, giving an O(n) order-preserving dedupe.
    return list(dict.fromkeys(titles))
|
||||
|
||||
|
||||
def filter_unique_titles(titles):
    """Ask the OpenAI completion API to reduce titles to unique topics.

    Args:
        titles: List of headline strings.

    Returns:
        The non-empty, whitespace-stripped lines of the model's reply. An
        empty list is returned, without calling the API, when ``titles`` is
        empty.
    """
    # Nothing to filter: skip the API round-trip entirely.
    if not titles:
        return []

    # Build the prompt: instruction, one bullet per title, then the cue line.
    prompt = (
        "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
        + "".join(f"- {title}\n" for title in titles)
        + "Filtered unique titles:\n"
    )

    print(prompt)

    # NOTE(review): text-davinci-002 and the legacy Completion endpoint appear
    # on OpenAI's deprecation list — confirm availability and plan a migration.
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # The reply is expected to hold one title per line.
    filtered_titles = response.choices[0].text.strip().split("\n")

    print(filtered_titles)

    # Drop blank lines and surrounding whitespace.
    return [title.strip() for title in filtered_titles if title.strip()]
|
||||
|
||||
|
||||
def get_todays_events():
|
||||
# Check if the 'todays_events' key exists
|
||||
todays_events = redis_client.get('todays_events')
|
||||
@@ -170,23 +108,7 @@ def get_todays_events():
|
||||
# If the key exists, return its value
|
||||
return todays_events.decode('utf-8')
|
||||
else:
|
||||
# If the key doesn't exist, extract titles from the URLs and filter unique titles
|
||||
urls = ['https://www.klix.ba', 'https://www.avaz.ba']
|
||||
titles = extract_titles(urls)
|
||||
unique_titles = filter_unique_titles(titles)
|
||||
|
||||
# Convert the unique titles list to a string separated by newline
|
||||
todays_events_str = "\n".join(unique_titles)
|
||||
|
||||
# Save the result to Redis with a 10-minute expiration time
|
||||
redis_client.set('todays_events', todays_events_str, ex=600)
|
||||
|
||||
# return the result but split by newline, and then choose 7 random titles, and merge again into newline separated string
|
||||
return "\n".join(random.sample(todays_events_str.split("\n"), 7))
|
||||
|
||||
|
||||
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Reference in New Issue
Block a user