Add add_events script

This commit is contained in:
Senad Uka
2023-05-15 17:15:30 +02:00
parent 4b93527998
commit 8a90dc9d62
2 changed files with 93 additions and 81 deletions

90
backend/add_events.py Normal file
View File

@@ -0,0 +1,90 @@
import os
from dotenv import load_dotenv
import openai
import redis
import random
import requests
from bs4 import BeautifulSoup
# Load variables from a local .env file (if present) before reading the
# environment. By default python-dotenv does not override variables that are
# already set, so real environment configuration still takes precedence.
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")
redis_url = os.getenv("REDIS_URL")
redis_client = redis.from_url(redis_url)
def add_current_events():
    """Scrape current headlines, de-duplicate them and cache them in Redis.

    Headings are extracted from the news front pages listed below, reduced to
    unique topics with ``filter_unique_titles`` and stored, newline-separated,
    under the ``todays_events`` Redis key with a 10-minute expiry.

    Returns:
        A newline-separated string of up to 7 randomly chosen titles, or an
        empty string when no titles were produced.
    """
    urls = ['https://www.klix.ba', 'https://www.avaz.ba']
    titles = extract_titles(urls)
    unique_titles = filter_unique_titles(titles)

    # Convert the unique titles list to a string separated by newline
    todays_events_str = "\n".join(unique_titles)

    # Save the result to Redis with a 10-minute expiration time
    redis_client.set('todays_events', todays_events_str, ex=600)

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample size at the number of titles.
    sample_size = min(7, len(unique_titles))
    return "\n".join(random.sample(unique_titles, sample_size))
def extract_titles(urls, timeout=10):
    """Collect the text of every <h1>, <h2> and <h3> element on the given pages.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds passed to ``requests.get`` so an unresponsive site
            cannot block the script indefinitely.

    Returns:
        A list of stripped heading texts. For each URL the h1 headings come
        first, then h2, then h3. If fetching or parsing a URL raises, the
        error is printed and processing continues with the next URL.
    """
    titles = []
    # Set the User-Agent to Chrome on Windows
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for url in urls:
        try:
            response = requests.get(url, headers=request_headers, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')
            for tag in ['h1', 'h2', 'h3']:
                # Use a distinct name here: rebinding the request-headers
                # variable to the found tags would corrupt the headers sent
                # for every subsequent URL.
                for heading in soup.find_all(tag):
                    titles.append(heading.text.strip())
        except Exception as e:
            # Best-effort scraping: report the failure and move on.
            print(f"Error processing URL {url}: {e}")
    return titles
def filter_out_titles_with_duplicate_meanings(titles):
    """Return ``titles`` with exact duplicates removed, keeping first-seen order.

    Despite the name, only equality is considered; titles that merely share a
    meaning are not detected.

    Args:
        titles: Iterable of hashable titles (strings).

    Returns:
        A new list containing each distinct title once, in order of first
        appearance.
    """
    # dict keys are unique and keep insertion order, giving an O(n)
    # order-preserving de-duplication instead of repeated list scans.
    return list(dict.fromkeys(titles))
def filter_unique_titles(titles):
    """Ask the OpenAI completion API to reduce ``titles`` to unique topics.

    The titles are sent as a "- " bulleted list and the completion text is
    split into lines. The prompt and the raw result lines are printed.

    Args:
        titles: List of title strings.

    Returns:
        A list of non-empty, whitespace-stripped lines from the completion,
        or an empty list when ``titles`` is empty.
    """
    # Nothing to filter: skip the API round-trip rather than asking the model
    # to "filter" an empty list.
    if not titles:
        return []

    # Prepare the prompt
    bullet_lines = "".join(f"- {title}\n" for title in titles)
    prompt = (
        "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
        + bullet_lines
        + "Filtered unique titles:\n"
    )
    print(prompt)

    # Call the GPT API
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # Extract the filtered titles
    # NOTE(review): the model may echo the "- " bullet prefix used in the
    # prompt; lines are returned as-is apart from whitespace stripping.
    filtered_titles = response.choices[0].text.strip().split("\n")
    print(filtered_titles)

    # Clean up and return the titles
    return [title.strip() for title in filtered_titles if title.strip()]
# Script entry point: refresh the cached events when this file is run directly.
if __name__ == "__main__":
    add_current_events()

View File

@@ -22,6 +22,8 @@ cors = CORS(app, resources={
"http://pitajramizu.com",
"http://www.pitajramizu.com",
"https://c50a-77-77-231-127.ngrok-free.app",  # comma required: adjacent string literals are otherwise concatenated into one bogus origin
"https://pitajramizu.com",
"https://www.pitajramizu.com",
]
}
})
@@ -98,70 +100,6 @@ def chat():
def extract_titles(urls, timeout=10):
    """Collect the text of every <h1>, <h2> and <h3> element on the given pages.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds passed to ``requests.get`` so an unresponsive site
            cannot block the caller indefinitely.

    Returns:
        A list of stripped heading texts. For each URL the h1 headings come
        first, then h2, then h3. If fetching or parsing a URL raises, the
        error is printed and processing continues with the next URL.
    """
    titles = []
    # Set the User-Agent to Chrome on Windows
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for url in urls:
        try:
            response = requests.get(url, headers=request_headers, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')
            for tag in ['h1', 'h2', 'h3']:
                # Use a distinct name here: rebinding the request-headers
                # variable to the found tags would corrupt the headers sent
                # for every subsequent URL.
                for heading in soup.find_all(tag):
                    titles.append(heading.text.strip())
        except Exception as e:
            # Best-effort scraping: report the failure and move on.
            print(f"Error processing URL {url}: {e}")
    return titles
def filter_out_titles_with_duplicate_meanings(titles):
    """Return ``titles`` with exact duplicates removed, keeping first-seen order.

    Despite the name, only equality is considered; titles that merely share a
    meaning are not detected.

    Args:
        titles: Iterable of hashable titles (strings).

    Returns:
        A new list containing each distinct title once, in order of first
        appearance.
    """
    # dict keys are unique and keep insertion order, giving an O(n)
    # order-preserving de-duplication instead of repeated list scans.
    return list(dict.fromkeys(titles))
def filter_unique_titles(titles):
    """Ask the OpenAI completion API to reduce ``titles`` to unique topics.

    The titles are sent as a "- " bulleted list and the completion text is
    split into lines. The prompt and the raw result lines are printed.

    Args:
        titles: List of title strings.

    Returns:
        A list of non-empty, whitespace-stripped lines from the completion,
        or an empty list when ``titles`` is empty.
    """
    # Nothing to filter: skip the API round-trip rather than asking the model
    # to "filter" an empty list.
    if not titles:
        return []

    # Prepare the prompt
    bullet_lines = "".join(f"- {title}\n" for title in titles)
    prompt = (
        "Filter the following titles to include only unique topics, preferring longer titles when collisions are found:\n"
        + bullet_lines
        + "Filtered unique titles:\n"
    )
    print(prompt)

    # Call the GPT API
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # Extract the filtered titles
    # NOTE(review): the model may echo the "- " bullet prefix used in the
    # prompt; lines are returned as-is apart from whitespace stripping.
    filtered_titles = response.choices[0].text.strip().split("\n")
    print(filtered_titles)

    # Clean up and return the titles
    return [title.strip() for title in filtered_titles if title.strip()]
def get_todays_events():
# Check if the 'todays_events' key exists
todays_events = redis_client.get('todays_events')
@@ -170,23 +108,7 @@ def get_todays_events():
# If the key exists, return its value
return todays_events.decode('utf-8')
else:
# If the key doesn't exist, extract titles from the URLs and filter unique titles
urls = ['https://www.klix.ba', 'https://www.avaz.ba']
titles = extract_titles(urls)
unique_titles = filter_unique_titles(titles)
# Convert the unique titles list to a string separated by newline
todays_events_str = "\n".join(unique_titles)
# Save the result to Redis with a 10-minute expiration time
redis_client.set('todays_events', todays_events_str, ex=600)
# return the result but split by newline, and then choose 7 random titles, and merge again into newline separated string
return "\n".join(random.sample(todays_events_str.split("\n"), 7))
return ""
if __name__ == '__main__':