Test Title 1
-Test Text 1
- First -diff --git a/cmd/web/svevijesti b/cmd/web/svevijesti new file mode 100644 index 0000000..f40e138 Binary files /dev/null and b/cmd/web/svevijesti differ diff --git a/internal/database/articles.go b/internal/database/articles.go index c85aeda..bd96180 100644 --- a/internal/database/articles.go +++ b/internal/database/articles.go @@ -2,12 +2,13 @@ package database import ( "fmt" - _ "github.com/lib/pq" - "gitlab.com/kbr4/svevijesti/internal/model" "html/template" "math" "strings" "time" + + _ "github.com/lib/pq" + "gitlab.com/kbr4/svevijesti/internal/model" ) func InsertArticle(store *Store, article model.ScrapedArticle) (err error) { @@ -186,3 +187,48 @@ func PreviousAndNextArticleUrlByID(store *Store, ID int) (nextUrl string, previo return nextResult, previousResult, nil } + +func ArticleByCategory(store *Store, day time.Time) (cateogry []model.DisplayArticle, err error) { + + result := []model.DisplayArticle{} + query, err := store.Prepare(`select id,title,content,slug,created_at,source_id,category from articles where created_at > $1 and created_at < $2 and LENGTH(content) > 10 order by id desc;`) + if err != nil { + return result, err + } + defer query.Close() + + tomorow := day.AddDate(0, 0, 1) + todayDate := day.Format("2024-01-26") + tomorrowDate := tomorow.Format("2024:01:26") + + rows, err := query.Query(todayDate, tomorrowDate) + if err != nil { + return result, err + } + defer rows.Close() + + for rows.Next() { + r := model.DisplayArticle{} + err = rows.Scan(&r.ID, &r.Title, &r.Content, &r.Slug, &r.CreatedAt, &r.OriginalUrl, &r.SourceId, &r.CreatedAt, &r.Category) + if err != nil { + return result, err + } + + ago := time.Now().Sub(r.CreatedAt) + hours := ago.Hours() + + if hours < 1 { + r.FormatedCreatedAt = fmt.Sprintf("Prije %d sati.", int(math.Floor(ago.Minutes()))) + } else if hours > 24 { + r.FormatedCreatedAt = r.CreatedAt.Format("28.01.2024. 01:03:05") + } else { + r.FormatedCreatedAt = fmt.Sprintf("Prije %d sati.", int(math.Floor(ago.Minutes()))) + } + + r.SourceName = model.SourceName(r.SourceId) + + result = append(result, r) + } + return result, nil + +} diff --git a/internal/model/model.go b/internal/model/model.go index c085bdf..20f3477 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -23,6 +23,7 @@ type DisplayArticle struct { CreatedAt time.Time FormatedCreatedAt string SourceName string + Category string } const ( diff --git a/internal/server/articles.go b/internal/server/articles.go index d64ffab..e67f098 100644 --- a/internal/server/articles.go +++ b/internal/server/articles.go @@ -2,11 +2,12 @@ package server import ( "fmt" - "github.com/gorilla/mux" - "gitlab.com/kbr4/svevijesti/internal/database" "net/http" "strconv" "time" + + "github.com/gorilla/mux" + "gitlab.com/kbr4/svevijesti/internal/database" ) func rootHandler(wr http.ResponseWriter, req *http.Request) { diff --git a/internal/server/category.go b/internal/server/category.go new file mode 100644 index 0000000..9c2c6f3 --- /dev/null +++ b/internal/server/category.go @@ -0,0 +1,34 @@ +package server + +import ( + "net/http" + "time" + + "gitlab.com/kbr4/svevijesti/internal/database" + "gitlab.com/kbr4/svevijesti/internal/model" +) + +func categoryHandler(wr http.ResponseWriter, r *http.Request) { + + store, err := database.Connect() + + if err != nil { + http.Error(wr, err.Error(), http.StatusInternalServerError) + } + + articles, err := database.ArticleByCategory(store, time.Now()) + + articlesByCategory := make(map[string][]model.DisplayArticle) + + for _, article := range articles { + articlesByCategory[article.Category] = append(articlesByCategory[article.Category], article) + } + + for category, articles := range articlesByCategory { + data := map[string]interface{}{"Category": category, "Articles": articles} + err := templates.ExecuteTemplate(wr, "categoryHTML", data) + if err != nil { + panic(err) + } + } +} diff --git a/internal/server/server.go b/internal/server/server.go index 9d7b07b..ae07f3b 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -2,11 +2,12 @@ package server import ( "fmt" - "github.com/gorilla/mux" "html/template" "io/ioutil" "path/filepath" "strings" + + "github.com/gorilla/mux" ) var tPath = "./web/tpl/" @@ -42,5 +43,7 @@ func CreateRoutes() *mux.Router { r.HandleFunc("/dan/{date}", dailyArticlesHandler) r.HandleFunc("/{id:[0-9]+}/{slug}", articleHandler) r.HandleFunc("/", rootHandler) + r.HandleFunc("/weather", WeatherHandler) + r.HandleFunc("/{category}", categoryHandler) return r } diff --git a/internal/server/weather.go b/internal/server/weather.go new file mode 100644 index 0000000..1ee43fe --- /dev/null +++ b/internal/server/weather.go @@ -0,0 +1,97 @@ +package server + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "net/http" +) + +const apiKey = "abb35e21bdcbad6d1b00141a2b25cf5a" + +type WeatherData struct { + Coord struct { + Lat float64 `json:"lat"` + Lon float64 `json:"lon"` + } `json:"coord"` + Weather []struct { + Description string `json:"description"` + Icon string `json:"icon"` + } `json:"weather"` + Main struct { + Temp float64 `json:"temp"` + FellsLike float64 `json:"fells_like"` + Preassure int `json:"preassure"` + Humidity int `json:"humidity"` + TempMin float64 `json:"temp_min"` + TempMax float64 `json:"temp_max"` + } `json:"main"` + Wind struct { + Speed float64 `json:"speed"` + Deg float64 `json:"deg"` + } `json:"wind"` + Clouds struct { + All int `json:"all"` + } `json:"clouds"` + Name string `json:"name"` +} + +func getWeather(city string) (WeatherData, error) { + url := fmt.Sprintf("http://api.openweathermap.org/data/2.5/weather?q=%s&appid=%s&units=metric&lang=hr", city, apiKey) + + resp, err := http.Get(url) + if err != nil { + return WeatherData{}, err + } + defer resp.Body.Close() + + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return WeatherData{}, err + } + var weatherData WeatherData + err = json.Unmarshal(body, &weatherData) + if err != nil { + return WeatherData{}, err + } + return weatherData, nil +} + +func WeatherHandler(w http.ResponseWriter, r *http.Request) { + cities := []string{"Sarajevo", "Banja Luka", "Zenica", "Tuzla", "Mostar"} + + var weatherInfo []WeatherData + for _, city := range cities { + data, err := getWeather(city) + if err != nil { + fmt.Printf("Error fetching weather for %s: %v\n", city, err) + continue + } + weatherInfo = append(weatherInfo, data) + } + + title := "Vremenska Prognoza" + data := map[string]interface{}{ + "title": title, + "weatherInfo": weatherInfo, + } + + err := templates.ExecuteTemplate(w, "fullweatherHTML", data) + if err != nil { + fmt.Println("Error executing template:", err) + http.Error(w, "Internal Server Error", http.StatusInternalServerError) + } + for _, info := range weatherInfo { + widgetData := map[string]interface{}{ + "Temperature": info.Main.Temp, + "City": info.Name, + } + err := templates.ExecuteTemplate(w, "weatherwidgetHTML", widgetData) + if err != nil { + fmt.Println("Error executing template:", err) + http.Error(w, "Internal Server Error", http.StatusInternalServerError) + return + } + } + +} diff --git a/pyth/.env b/pyth/.env index c213e8f..7051493 100644 --- a/pyth/.env +++ b/pyth/.env @@ -2,6 +2,6 @@ OPENAI_API_KEY = "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" DB_HOST =localhost DB_PORT =5432 -DB_USER =postgres +DB_USER =svevijesti DB_PASSWORD =salmonela pljusti 221 hamo DB_NAME =svevijestiweb \ No newline at end of file diff --git a/pyth/__pycache__/db_management.cpython-310.pyc b/pyth/__pycache__/db_management.cpython-310.pyc new file mode 100644 index 0000000..1d63854 Binary files /dev/null and b/pyth/__pycache__/db_management.cpython-310.pyc differ diff --git a/pyth/__pycache__/get_articles.cpython-310.pyc b/pyth/__pycache__/get_articles.cpython-310.pyc new file mode 100644 index 0000000..a788959 Binary files /dev/null and b/pyth/__pycache__/get_articles.cpython-310.pyc differ diff --git a/pyth/__pycache__/publishing_finals.cpython-310.pyc b/pyth/__pycache__/publishing_finals.cpython-310.pyc new file mode 100644 index 0000000..1517ce8 Binary files /dev/null and b/pyth/__pycache__/publishing_finals.cpython-310.pyc differ diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index b39ce5c..67d1d59 100644 Binary files a/pyth/__pycache__/scrapingsingle.cpython-310.pyc and b/pyth/__pycache__/scrapingsingle.cpython-310.pyc differ diff --git a/pyth/__pycache__/tttt.cpython-310.pyc b/pyth/__pycache__/tttt.cpython-310.pyc new file mode 100644 index 0000000..0a3bfcd Binary files /dev/null and b/pyth/__pycache__/tttt.cpython-310.pyc differ diff --git a/pyth/__pycache__/vectData.cpython-310.pyc b/pyth/__pycache__/vectData.cpython-310.pyc index e806a8a..340968f 100644 Binary files a/pyth/__pycache__/vectData.cpython-310.pyc and b/pyth/__pycache__/vectData.cpython-310.pyc differ diff --git a/pyth/articles.py b/pyth/articles.py deleted file mode 100644 index b5ae49f..0000000 --- a/pyth/articles.py +++ /dev/null @@ -1,241 +0,0 @@ -import psycopg2 -import numpy as np -from dotenv import load_dotenv -import os -from openai import OpenAI -from langchain.embeddings import OpenAIEmbeddings -from vectData import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity,get_titles_links_embeddings -from scrapingsingle import num_tokens_from_string, slice_text_at_2k_tokens -import json -from json_repair import repair_json - -load_dotenv() - -OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") -client = OpenAI() -embeddings = OpenAIEmbeddings() - -print(f"Checking for similar!") - -def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95): - try: - titles, links, embeddings = get_titles_links_embeddings() - - processed_articles = set() - grouped_similar_articles = [] - - for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)): - if (title1, link1) not in processed_articles: - processed_articles.add((title1, link1)) - group = [(title1, link1)] - - for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)): - if i != j and (title2, link2) not in processed_articles: - similarity = calculate_cosine_similarity(embedding1, embedding2) - - if similarity > threshold: - processed_articles.add((title2, link2)) - group.append((title2, link2)) - - grouped_similar_articles.append(group) - - return grouped_similar_articles - - except psycopg2.Error as e: - print(f"Error: {e}") - return [] - -def processing_similar(): - grouped_similar_articles_result = find_and_group_similar_articles() - - if grouped_similar_articles_result: - for group in grouped_similar_articles_result: - articles = [] - - if len(group) > 1: - for article_tuple in group: - if len(article_tuple) >= 2: - title, link = article_tuple[:2] - article = [title, link] - articles.append(article) - l = len(articles) - - if l == 2: - a_one = articles[0][0] - a_two = articles[1][0] - - get_one = get_specific_data(a_one) - get_two = get_specific_data(a_two) - - text1 = get_one[0][1] - text2 = get_two[0][1] - link1 = get_one[0][2] - link2 = get_two[0][2] - if link1 != link2: - link = f"{link1}, {link2}" - else: - link = link1 - - ftoks = num_tokens_from_string(text1) - stoks = num_tokens_from_string(text2) - tokens = ftoks + stoks - - similar_d = f"C: {a_one}, {a_two}" - - modify_similar_data(similar_d, a_one) - preparing_articles(False, a_one) - - modify_similar_data(similar_d, a_two) - preparing_articles(False, a_two) - - if tokens > 2000: - combined_text = f"{text1} {text2}" - combined_text = slice_text_at_2k_tokens(combined_text) - user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" - else: - user_message = rf"Here are 2 texts {text1} {text2}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." - - if l == 3: - a_one = articles[0][0] - a_two = articles[1][0] - a_three = articles[2][0] - - get_one = get_specific_data(a_one) - get_two = get_specific_data(a_two) - get_three = get_specific_data(a_three) - - text1 = get_one[0][1] - text2 = get_two[0][1] - text3 = get_three[0][1] - link1 = get_one[0][2] - link2 = get_two[0][2] - link3 = get_three[0][2] - if link1 != link2: - if link2 != link3: - link = f"{link1}, {link2}, {link3}" - else: - link = f"{link1}, {link2}" - else: - if link2 != link3: - link = f"{link1}, {link3}" - else: - link = link1 - ftoks = num_tokens_from_string(text1) - stoks = num_tokens_from_string(text2) - ttoks = num_tokens_from_string(text3) - tokens = ftoks + stoks + ttoks - - similar_d = f"C: {a_one}, {a_two}, {a_three}" - modify_similar_data(similar_d, a_one) - preparing_articles(False, a_one) - - modify_similar_data(similar_d, a_two) - preparing_articles(False, a_two) - - modify_similar_data(similar_d, a_three) - preparing_articles(False, a_three) - - if tokens > 2000: - combined_text = f"{text1} {text2} {text3}" - combined_text = slice_text_at_2k_tokens(combined_text) - user_message = rf"Here is text {combined_text}, combined from 3 sources, filter text, and make news content, return as JSON only with single 'content' field" - else: - user_message = rf"Here are 3 texts {text1} {text2} and {text3}, combine the following texts into a cohesive news remove any non-news related to both texts and provide the cleaned data as a JSON only with single 'content' field." - if l == 4: - a_one = articles[0][0] - a_two = articles[1][0] - a_three = articles[2][0] - a_four = articles[3][0] - - get_one = get_specific_data(a_one) - get_two = get_specific_data(a_two) - get_three = get_specific_data(a_three) - get_four = get_specific_data(a_four) - - text1 = get_one[0][1] - text2 = get_two[0][1] - text3 = get_three[0][1] - text4 = get_four[0][1] - link1 = get_one[0][2] - link2 = get_two[0][2] - link3 = get_three[0][2] - link4 = get_four[0][2] - - if link1 != link2: - if link2 != link3: - if link3 != link4: - link = f"{link1}, {link2}, {link3}, {link4}" - else: - link = f"{link1}, {link2}, {link3}" - else: - if link3 != link4: - link = f"{link1}, {link2}, {link4}" - else: - link = f"{link1}, {link2}" - else: - if link2 != link3: - if link3 != link4: - link = f"{link1}, {link3}, {link4}" - else: - link = f"{link1}, {link3}" - else: - if link3 != link4: - link = f"{link1}, {link4}" - else: - link = link1 - - ftoks = num_tokens_from_string(text1) - stoks = num_tokens_from_string(text2) - ttoks = num_tokens_from_string(text3) - frtoks = num_tokens_from_string(text4) - - tokens = ftoks + stoks + ttoks + frtoks - - similar_d = f"C: {a_one}, {a_two}, {a_three}, {a_four}" - modify_similar_data(similar_d, a_one) - preparing_articles(False, a_one) - - modify_similar_data(similar_d, a_two) - preparing_articles(False, a_two) - - modify_similar_data(similar_d, a_three) - preparing_articles(False, a_three) - - modify_similar_data(similar_d, a_four) - preparing_articles(False, a_four) - - if tokens > 2000: - combined_text = f"{text1} {text2} {text3} {text4}" - combined_text = slice_text_at_2k_tokens(combined_text) - user_message = rf"Here is text {combined_text}, combined from 4 sources, filter text, and make news content, return as JSON only with a single 'content' field" - else: - user_message = rf"Here are 4 texts {text1} {text2} {text3} and {text4}, combine the following texts into a cohesive news, remove any non-news related to all texts, and provide the cleaned data as a JSON only with a single 'content' field." - try: - completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "system", "content": "Data analytic, Journalist and News reporter"}, - {"role": "user", "content": user_message} - ]) - generated_text = completion.choices[0].message.content - - generated_text = repair_json(generated_text) - - response_data = json.loads(generated_text) - title = a_one - text = response_data["content"] - vector = embeddings.embed_query(generated_text) - - insert_data(title, text, link, vector, similar_d) - print(f"Inserting combined: {title}") - - except Exception as e: - print(f"Error: {e}") - print(a_one) - continue - else: - print("Done!.") - else: - print("No similar articles found.") -if __name__=="__main__": - processing_similar() diff --git a/pyth/checking_similar.py b/pyth/checking_similar.py new file mode 100644 index 0000000..2a38b82 --- /dev/null +++ b/pyth/checking_similar.py @@ -0,0 +1,122 @@ +import psycopg2 +from dotenv import load_dotenv +import os +from openai import OpenAI +from langchain_openai import OpenAIEmbeddings +from db_management import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity, get_titles_links_embeddings +from get_articles import slice_text_at_2k_tokens +import json +from json_repair import repair_json +from publishing_finals import publish_articles + +load_dotenv() + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +client = OpenAI() +embeddings = OpenAIEmbeddings() + +print("Checking for similar!") + + +def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95): + try: + titles, links, embeddings = get_titles_links_embeddings() + + processed_articles = set() + grouped_similar_articles = [] + + for i, (title1, link1, embedding1) in enumerate(zip(titles, links, embeddings)): + if (title1, link1) not in processed_articles: + processed_articles.add((title1, link1)) + group = [(title1, link1)] + + for j, (title2, link2, embedding2) in enumerate(zip(titles, links, embeddings)): + if i != j and (title2, link2) not in processed_articles: + similarity = calculate_cosine_similarity(embedding1, embedding2) + + if similarity > threshold: + if link1 != link2: + processed_articles.add((title2, link2)) + group.append((title2, link2, embedding2)) + + grouped_similar_articles.append(group) + return grouped_similar_articles + + except psycopg2.Error as e: + print(f"Error: {e}") + return [] + + +def processing_articles(articles): + unique_links = set() + + for article in articles: + a_title, a_link = article[:2] + get_data = get_specific_data(a_title) + text = get_data[0][1] + link = a_link + + modify_similar_data(f"C: {', '.join(art[0] for art in articles)}", a_title) + preparing_articles(False, a_title) + + if link not in unique_links: + unique_links.add(link) + + combined_text = ' '.join(get_specific_data(art[0])[0][1] for art in articles) + combined_text = slice_text_at_2k_tokens(combined_text) + + if len(unique_links) == 1: + link = next(iter(unique_links)) + else: + link = ', '.join(unique_links) + return combined_text, link + + +def processing_similar(): + grouped_similar_articles_result = find_and_group_similar_articles() + + if grouped_similar_articles_result: + for group in grouped_similar_articles_result: + articles = group + + if len(articles) > 1: + combined_text, link = processing_articles(articles) + user_message = ( + rf"Here are {len(articles)} texts {combined_text}, combine the following texts into a cohesive news, " + rf"remove any non-news related to all texts, and provide the cleaned data on Bosnian languageas and return as JSON only with a single 'content' field." + ) + + try: + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": user_message} + ]) + generated_text = repair_json(completion.choices[0].message.content) + + response_data = json.loads(generated_text) + title = articles[0][0] + text = response_data["content"] + vector = embeddings.embed_query(generated_text) + tmpCategory = get_specific_data(title) + category = tmpCategory[0][5] + + + + insert_data(title, text, link, vector, f"C: {', '.join(art[0] for art in articles)}", category) + print(f"Inserting combined: {title} and Category: {category}") + + except Exception as e: + print(f"Error: {e}") + print(articles[0][0]) + continue + else: + print("Done!.") + else: + print("No similar articles found.") + + +if __name__ == "__main__": + processing_similar() + publish_articles() diff --git a/pyth/vectData.py b/pyth/db_management.py similarity index 71% rename from pyth/vectData.py rename to pyth/db_management.py index e3deda7..bd9dd7e 100644 --- a/pyth/vectData.py +++ b/pyth/db_management.py @@ -68,7 +68,7 @@ def is_similar_data(title, text, link, embedding, threshold=0.98): def get_similar(): cursor = conn.cursor() - query = '''SELECT title,similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''' + query = '''SELECT title, link, similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''' cursor.execute(query) similar_data = cursor.fetchall() cursor.close() @@ -87,18 +87,23 @@ def get_titles_links_embeddings(): return titles, links, embeddings -def insert_data(title, text, link, embedding, similar_d): +def insert_data(title, text, link, embedding, similar_d,category): c_time = datetime.now() cursor = conn.cursor() cursor.execute(''' - INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready) - VALUES (%s, %s, %s, %s, %s ,%s ,%s); - ''', (title, text, link, embedding , similar_d, c_time, True)) + INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready, category) + VALUES (%s, %s, %s, %s, %s ,%s ,%s ,%s); + ''', (title, text, link, embedding , similar_d, c_time, True , category)) conn.commit() cursor.close() -def get_data(): +def insert_final(title,text,slug,link,source_id, category): + with conn.cursor() as cursor: + cursor.execute('''INSERT INTO articles (title, content, slug, original_url, source_id, category) + VALUES (%s, %s, %s, %s, %s, %s)ON CONFLICT (original_url) DO NOTHING;''',(title , text, slug, link, source_id, category)) + conn.commit() +def get_data(): cursor = conn.cursor() query = '''SELECT title,text,link FROM vectorsvevijesti;''' cursor.execute(query) @@ -108,7 +113,7 @@ def get_data(): def get_ready_data(): cursor = conn.cursor() - query = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;''' + query = '''SELECT title, text, link, time, similar_d, category FROM vectorsvevijesti WHERE ready = %s;''' cursor.execute(query, ('True',)) data = cursor.fetchall() cursor.close() @@ -122,14 +127,12 @@ def get_source_data(): cursor.close() return data - def modify_similar_data(new_value ,title): cursor = conn.cursor() query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s ''' cursor.execute(query, (new_value, title)) conn.commit() - def preparing_articles(new_value ,title): cursor = conn.cursor() query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s ''' @@ -138,13 +141,12 @@ def preparing_articles(new_value ,title): def get_specific_data(title): cursor = conn.cursor() - query = '''SELECT title, text, link, similar_d, embedding, ready FROM vectorsvevijesti WHERE title = %s''' + query = '''SELECT title, text, link, similar_d, embedding, category, ready FROM vectorsvevijesti WHERE title = %s''' cursor.execute(query, (title,)) specific_post = cursor.fetchall() cursor.close() return specific_post - def get_all_links(): cursor = conn.cursor() query = '''SELECT link FROM vectorsvevijesti''' @@ -153,6 +155,14 @@ def get_all_links(): cursor.close() return db_links +def get_existing_titles(): + cursor = conn.cursor() + query = '''SELECT title, original_url FROM articles''' + cursor.execute(query) + db_links = {link[0] for link in cursor.fetchall()} + cursor.close() + return db_links + def delete_specific(title): cursor = conn.cursor() query = '''DELETE FROM vectorsvevijesti WHERE title = %s''' @@ -192,4 +202,48 @@ def create_db(): ''') conn.commit() cursor.close() + +def create_db(): + cursor = conn.cursor() + cursor.execute("CREATE EXTENSION IF NOT EXISTS vector") + register_vector(conn) + cursor.execute(''' + CREATE TABLE IF NOT EXISTS vectorsvevijesti ( + id bigserial PRIMARY KEY, + title VARCHAR, + text VARCHAR, + link VARCHAR, + embedding vector(1536), + similar_d VARCHAR, + time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + ready BOOLEAN, + category VARCHAR + ); + ''') + conn.commit() + cursor.close() + +def create_ar_table(): + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS "articles" ( + "id" bigserial PRIMARY KEY, + "title" text NOT NULL UNIQUE, + "content" text NOT NULL, + "slug" text NOT NULL UNIQUE, + "created_at" timestamptz DEFAULT NOW() NOT NULL, + "original_url" text NOT NULL UNIQUE, + "source_id" int NOT NULL, + "category" VARCHAR + + ); + ''') + conn.commit() + cursor.close() + +import psycopg2 +from psycopg2 import sql + + create_db() +create_ar_table() diff --git a/pyth/delete_db.py b/pyth/delete_db.py new file mode 100644 index 0000000..960012e --- /dev/null +++ b/pyth/delete_db.py @@ -0,0 +1,2 @@ +from db_management import delete_tables +delete_tables() \ No newline at end of file diff --git a/pyth/scrapingsingle.py b/pyth/get_articles.py similarity index 50% rename from pyth/scrapingsingle.py rename to pyth/get_articles.py index 672ba87..20d36f3 100644 --- a/pyth/scrapingsingle.py +++ b/pyth/get_articles.py @@ -3,8 +3,8 @@ import requests from urllib.parse import urljoin from openai import OpenAI import os -from langchain.embeddings import OpenAIEmbeddings -from vectData import (insert_data ,is_similar_data ,get_all_links,cleansing ) +from langchain_openai import OpenAIEmbeddings +from db_management import (insert_data ,is_similar_data ,get_all_links,cleansing ) import json from dotenv import load_dotenv import tiktoken @@ -18,7 +18,7 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") client = OpenAI() embeddings = OpenAIEmbeddings() -dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info'] +dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info','https://www.index.hr', 'https://avaz.ba', 'https://www.telegraf.rs', 'https://www.blic.rs', 'https://www.vijesti.me','https://dnevnik.hr','https://24sata.hr'] headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'} def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int: @@ -97,50 +97,65 @@ final_links = fix_links(final_links) if __name__ == '__main__': - for link in final_links: - response = requests.get(link,headers) - soup = BeautifulSoup(response.text, 'html.parser') + for link in final_links: + if link not in db_links: + print(f"Processing link: {link}") + db_links.add(link) - titles = soup.find_all(['h2', 'h1','h3']) - title_text = ' '.join([title.get_text(strip=True) for title in titles]) + response = requests.get(link,headers) + soup = BeautifulSoup(response.text, 'html.parser') - texts = soup.find_all(['p']) - text_text = ' '.join([text.get_text(strip=True) for text in texts]) + titles = soup.find_all(['h2', 'h1','h3']) + title_text = ' '.join([title.get_text(strip=True) for title in titles]) - text_text = text_text - title_text = title_text + texts = soup.find_all(['p']) + text_text = ' '.join([text.get_text(strip=True) for text in texts]) + + text_text = text_text + title_text = title_text - title_text = replace_with_spaces(title_text) + title_text = replace_with_spaces(title_text) - text_text = slice_text_at_2k_tokens(text_text) - text_text = replace_with_spaces(str(text_text)) + text_text = slice_text_at_2k_tokens(text_text) + text_text = replace_with_spaces(str(text_text)) - ttk = num_tokens_from_string(text_text) + ttk = num_tokens_from_string(text_text) - if ttk > 1900: - title_text = slice_title_if_needed(title_text) - try: - completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "system", "content": "Data analytic, Journalist and News reporter"}, - {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."} - ] - ) - generated_text = completion.choices[0].message.content + category_options = ['politics','business','sport','magazine','scitech'] - generated_text = repair_json(generated_text) + if ttk > 1900: + title_text = slice_title_if_needed(title_text) + try: + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "Data analytic, Journalist and News reporter"}, + {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title and remove 'FOTO' and 'VIDEO' from title and text, from {category_options} select category in wich that news belong, and provide the cleaned data make sure that its on Bosnian language and valid JSON object with 'title' field, 'category' and 'content' field."} + ]) + generated_text = completion.choices[0].message.content - response_data = json.loads(generated_text) - title = response_data["title"] - text = response_data["content"] - vector = embeddings.embed_query(generated_text) + generated_text = repair_json(generated_text) + + response_data = json.loads(generated_text) + title = response_data["title"] + predicted_category = response_data["category"] + text = response_data["content"] + + if predicted_category.lower() in category_options: + category = predicted_category.lower() + else: + category = 'other' + + vector = embeddings.embed_query(generated_text) + + print(f"Title: {title}") + print(f"Category: {category}") - if not is_similar_data(title, text, link, vector, threshold=0.98): - similar_d = "NO" - insert_data(title, text, link, vector,similar_d) + if not is_similar_data(title, text, link, vector, threshold=0.98): + similar_d = "NO" + insert_data(title, text, link, vector,similar_d,category) - except Exception as e: - print(f"Error in completion: {e}") - continue + except Exception as e: + print(f"Error in completion: {e}") + continue diff --git a/pyth/publishing_finals.py b/pyth/publishing_finals.py new file mode 100644 index 0000000..3a99411 --- /dev/null +++ b/pyth/publishing_finals.py @@ -0,0 +1,69 @@ +from slugify import slugify +import random +from db_management import get_ready_data,insert_final,get_existing_titles + +def create_slug(title): + base_slug = "{} {}".format(random.randint(1, 1000), title) + slug = slugify(base_slug) + return slug + +def get_source_id(link,similar): + if similar == "NO": + if "srpskainfo" in link: + return 1 + elif "klix" in link: + return 2 + elif "bljesak" in link: + return 3 + elif "blic" in link: + return 4 + elif "index.hr" in link: + return 6 + elif "avaz" in link: + return 7 + elif "telegraf" in link: + return 8 + elif "vijesti.me" in link: + return 9 + elif "dnevnik.hr" in link: + return 10 + elif "24sata.hr" in link: + return 11 + else: + return 0 + else: + return 5 + +data = get_ready_data() + +def remove_braces_and_quotes(text): + final_text = text.replace('{"', '') + final_text = final_text.replace('"}', '') + + return final_text + + +def publish_articles(): + for d in data: + title = d[0] + text = d[1] + link = d[2] + similar_d = d[4] + category = d[5] + slug = create_slug(title) + source_id = get_source_id(link,similar_d) + + check = get_existing_titles() + + title_check = any(title in t for t in check) + link_check = any(link in l for l in check) + + if title_check or link_check: + continue + else: + text = remove_braces_and_quotes(text) + title = remove_braces_and_quotes(title) + print(f"Source: {source_id}") + print(f"Link: {link}") + insert_final(title, text, slug, link, source_id, category) + print(f"Publishing: {title}") diff --git a/pyth/templates/index.html b/pyth/templates/index.html deleted file mode 100644 index c9e51c1..0000000 --- a/pyth/templates/index.html +++ /dev/null @@ -1,22 +0,0 @@ - - -
- - -Test Text
- - \ No newline at end of file diff --git a/pyth/templates/two.html b/pyth/templates/two.html deleted file mode 100644 index bcba718..0000000 --- a/pyth/templates/two.html +++ /dev/null @@ -1,12 +0,0 @@ - - - - - -Test Text
- - \ No newline at end of file diff --git a/pyth/tests/test_scrapingsingle.py b/pyth/tests/test_scrapingsingle.py index 5afcfda..8830373 100644 --- a/pyth/tests/test_scrapingsingle.py +++ b/pyth/tests/test_scrapingsingle.py @@ -7,7 +7,7 @@ from langchain.vectorstores.pgvector import PGVector from openai import OpenAI import json from dotenv import load_dotenv -from scrapingsingle import get_article_links, insert_data, is_similar_data +from pyth.get_articles import get_article_links, insert_data, is_similar_data import os load_dotenv() diff --git a/pyth/tests/test_vectData.py b/pyth/tests/test_vectData.py index 99d4dd6..6450d75 100644 --- a/pyth/tests/test_vectData.py +++ b/pyth/tests/test_vectData.py @@ -2,7 +2,7 @@ import unittest import numpy as np import psycopg2 import os -from vectData import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db +from pyth.db_management import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db class TestIntegration(unittest.TestCase): host = os.getenv("DB_HOST") diff --git a/pyth/web-server.py b/pyth/web-server.py deleted file mode 100644 index ed1dc44..0000000 --- a/pyth/web-server.py +++ /dev/null @@ -1,29 +0,0 @@ -from flask import Flask , render_template , jsonify -from vectData import get_ready_data -from flask_cors import CORS - - -app = Flask(__name__) - -CORS(app) - -@app.route('/') -def index() : - return render_template("index.html") - - -@app.route('/article/one') -def articleone(): - return render_template("one.html") - - -@app.route('/article/two') -def articletwo(): - return render_template("two.html") - -@app.route('/data/get/news', methods=['GET']) -def takenews(): - data = get_ready_data() - return jsonify(data) - -app.run(debug=True) \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..efa7987 --- /dev/null +++ b/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +exec kill -9 $(lsof -t -i:8080) +exec go run cmd/web/web.go diff --git a/server b/server old mode 100755 new mode 100644 diff --git a/spider b/spider old mode 100755 new mode 100644 diff --git a/web/data/articles.html b/web/data/articles.html index 66c0d3c..2931132 100644 --- a/web/data/articles.html +++ b/web/data/articles.html @@ -1,16 +1,82 @@ {{define "articlesHTML"}} -
_____ ______ ____ ____ ___ ____ ___ __ __ ____ ____ ___
/ ___/| | / || \ / _]| \ / \ | | || || \ / _]
@@ -10,11 +12,51 @@
\___| |__| |__|__||__|\_||_____||__|__| \___/ \_/ |____||__|__||_____|
-