Compare commits

43 Commits

Author SHA1 Message Date
1ed2a7dd98 Merge branch 'svevijesti-upokusaju' into 'master'
Svevijesti upokusaju

See merge request kbr4/svevijesti!6
2024-02-01 11:44:21 +00:00
193deabdef small fixex of position 2024-02-01 12:39:48 +01:00
dc9992cba0 fixing menu icon, and category name 2024-01-31 23:25:28 +01:00
232df1e4e0 Weather and Category 2024-01-31 12:37:55 +01:00
f4a2251178 Changing from js to golang 2024-01-29 14:55:20 +01:00
30d8ca73da Merge branch 'svevijesti-cs' into 'master'
Combine similar article

See merge request kbr4/svevijesti!5
2024-01-08 09:45:03 +00:00
54a41046ce Fixed response/JSON 2024-01-08 00:28:20 +01:00
b7a0e5478c organizing code 2024-01-07 03:41:32 +01:00
96a2d88895 Removing previous f. 2024-01-06 08:26:31 +01:00
d4e99c7c5f added article.py 2024-01-06 08:17:05 +01:00
ae1c1902da Combine similar article 2024-01-02 15:00:07 +01:00
fff1c94a3d Merge branch 'svevijesti-as' into 'master'
Fresh

See merge request kbr4/svevijesti!2
2023-12-25 12:46:29 +00:00
459dfd08aa Update .gitlab-ci.yml file 2023-12-13 10:49:45 +00:00
Senad Uka
ef6fce64d8 Fix avaz - skip english 2022-02-22 21:12:12 +01:00
Senad Uka
c8d361f458 Added avaz / fixed install script 2022-02-22 21:06:27 +01:00
Senad Uka
2db4b0b2e2 Fix crawler 2022-02-22 13:23:35 +01:00
Senad Uka
0e9956a227 Fix scrapers 2022-02-22 05:18:39 +01:00
Senad Uka
4b863369b6 Fix scrapers 2022-02-17 19:14:53 +01:00
Senad Uka
cb20d84803 Fix scrapers 2022-02-17 19:00:57 +01:00
Senad Uka
135cf90e7a Fix scrapers 2022-02-17 18:58:51 +01:00
Senad Uka
5f22302e44 Fix scrapers 2022-02-17 18:58:16 +01:00
Senad Uka
f4ea31373d FIx bljesak crawler 2022-02-17 07:16:35 +01:00
Senad Uka
a748bcb303 FIx sprskainfo crawler 2022-02-17 04:29:37 +01:00
Senad Uka
31167ff740 Fix bug with article navigation 2022-02-16 20:02:05 +01:00
Senad Uka
651e1b4933 Fix time format 2022-02-16 19:16:33 +01:00
Senad Uka
ba918d1f57 Add navigation 2022-02-16 19:11:48 +01:00
Senad Uka
47987ff395 Added bljesak.info 2022-02-15 18:00:30 +01:00
Senad Uka
6648f6754a Added srpskainfo crawler 2022-02-15 07:03:30 +01:00
Senad Uka
08d81be857 Fix the title 2022-02-15 05:43:52 +01:00
Senad Uka
81da0a29a6 Add install script 2022-02-15 05:41:24 +01:00
Senad Uka
cdce8d3c67 Add install script 2022-02-15 05:39:07 +01:00
Senad Uka
089fa2b57f og title change 2022-02-15 05:35:27 +01:00
Senad Uka
b3d4928663 Now fix bug with too many open connecitons 2022-02-15 04:49:40 +01:00
Senad Uka
7cb3620040 Fix bug with too many open connecitons 2022-02-15 04:30:31 +01:00
Senad Uka
33fe5b8a6a Build server 2022-02-14 20:01:56 +01:00
Senad Uka
aed3ba0dc1 Listen on 127.0.0.1 2022-02-14 20:00:15 +01:00
Senad Uka
a9441672ae Listen on all IPs 2022-02-14 19:55:15 +01:00
Senad Uka
c6af5ab6c6 Init script 2022-02-14 16:53:02 +01:00
Senad Uka
f0ba1ea2e4 Server binary 2022-02-14 11:05:40 +01:00
Senad Uka
9f57520080 Detalji clanaka 2022-02-14 11:03:56 +01:00
Senad Uka
f6e90deebd First version of web UI 2022-02-13 05:12:49 +01:00
Senad Uka
a040320827 Prva verzija - klix scraper 2022-02-10 21:11:13 +01:00
Senad Uka
08f0de07c3 Initial commit 2022-02-06 00:43:15 +01:00
48 changed files with 1865 additions and 287 deletions

9
.env Normal file
View File

@@ -0,0 +1,9 @@
OPENAI_API_KEY = "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"
DB_HOST =localhost
DB_PORT =5432
DB_USER =svevijesti
DB_PASSWORD =salmonela pljusti 221 hamo
DB_NAME =svevijestiweb
API_KEY=abb35e21bdcbad6d1b00141a2b25cf5a

BIN
cmd/web/svevijesti Normal file

Binary file not shown.

1
go.mod
View File

@@ -15,6 +15,7 @@ require (
github.com/gorilla/mux v1.8.0 // indirect
github.com/gosimple/slug v1.12.0 // indirect
github.com/gosimple/unidecode v1.0.1 // indirect
github.com/joho/godotenv v1.5.1 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/lib/pq v1.10.4 // indirect
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect

2
go.sum
View File

@@ -25,6 +25,8 @@ github.com/gosimple/slug v1.12.0 h1:xzuhj7G7cGtd34NXnW/yF0l+AGNfWqwgh/IXgFy7dnc=
github.com/gosimple/slug v1.12.0/go.mod h1:UiRaFH+GEilHstLUmcBgWcI42viBN7mAb818JrYOeFQ=
github.com/gosimple/unidecode v1.0.1 h1:hZzFTMMqSswvf0LBJZCZgThIZrpDHFXux9KeGmn6T/o=
github.com/gosimple/unidecode v1.0.1/go.mod h1:CP0Cr1Y1kogOtx0bJblKzsVWrqYaqfNOnHzpgWw4Awc=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/lib/pq v1.10.4 h1:SO9z7FRPzA03QhHKJrH5BXA6HU1rS4V2nIVrrNC1iYk=

View File

@@ -2,12 +2,13 @@ package database
import (
"fmt"
_ "github.com/lib/pq"
"gitlab.com/kbr4/svevijesti/internal/model"
"html/template"
"math"
"strings"
"time"
_ "github.com/lib/pq"
"gitlab.com/kbr4/svevijesti/internal/model"
)
func InsertArticle(store *Store, article model.ScrapedArticle) (err error) {
@@ -48,7 +49,7 @@ func ArticlesForDay(store *Store, day time.Time) (articles []model.DisplayArticl
result := []model.DisplayArticle{}
query, err := store.Prepare(`
select id,title, content, slug, original_url, source_id, created_at from articles where created_at > $1 and created_at < $2 and LENGTH(content) > 10 order by id desc;
select id,title, content, slug, original_url, source_id, created_at, category from articles where created_at > $1 and created_at < $2 and LENGTH(content) > 10 order by id desc;
`)
if err != nil {
return result, err
@@ -67,7 +68,7 @@ func ArticlesForDay(store *Store, day time.Time) (articles []model.DisplayArticl
for rows.Next() {
r := model.DisplayArticle{}
err = rows.Scan(&r.ID, &r.Title, &r.Content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt)
err = rows.Scan(&r.ID, &r.Title, &r.Content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt, &r.Category)
if err != nil {
return result, err
}
@@ -95,7 +96,7 @@ func ArticleByID(store *Store, ID int, slug string) (article model.DisplayArticl
result := model.DisplayArticle{}
query, err := store.Prepare(`
select id,title, content, slug, original_url, source_id, created_at from articles where id = $1 and slug = $2;
select id,title, content, slug, original_url, source_id, created_at, category from articles where id = $1 and slug = $2;
`)
if err != nil {
return result, err
@@ -109,7 +110,7 @@ func ArticleByID(store *Store, ID int, slug string) (article model.DisplayArticl
r := model.DisplayArticle{}
content := ""
err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt)
err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt, &r.Category)
if err != nil {
return result, err
}
@@ -138,7 +139,7 @@ func PreviousAndNextArticleUrlByID(store *Store, ID int) (nextUrl string, previo
nextResult, previousResult := "#", "#"
query, err := store.Prepare(`
select id,title, content, slug, original_url, source_id, created_at from articles where id < $1 and id > $2 order by id desc limit 1;
select id,title, content, slug, original_url, source_id, created_at, category from articles where id < $1 and id > $2 order by id desc limit 1;
`)
if err != nil {
fmt.Println("Err 1:", err)
@@ -154,7 +155,7 @@ func PreviousAndNextArticleUrlByID(store *Store, ID int) (nextUrl string, previo
r := model.DisplayArticle{}
content := ""
err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt)
err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt, &r.Category)
if err != nil {
return nextResult, previousResult, err
}
@@ -162,7 +163,7 @@ func PreviousAndNextArticleUrlByID(store *Store, ID int) (nextUrl string, previo
previousResult = fmt.Sprintf("/%d/%s", r.ID, r.Slug)
query2, err := store.Prepare(`
select id,title, content, slug, original_url, source_id, created_at from articles where id < $1 and id > $2 order by id asc limit 1;
select id,title, content, slug, original_url, source_id, created_at, category from articles where id < $1 and id > $2 order by id asc limit 1;
`)
if err != nil {
fmt.Println("Err 1:", err)
@@ -177,7 +178,7 @@ func PreviousAndNextArticleUrlByID(store *Store, ID int) (nextUrl string, previo
}
content = ""
err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt)
err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt, &r.Category)
if err != nil {
fmt.Println("Err 4:", err)
return nextResult, previousResult, err

View File

@@ -23,6 +23,7 @@ type DisplayArticle struct {
CreatedAt time.Time
FormatedCreatedAt string
SourceName string
Category string
}
const (

View File

@@ -2,11 +2,12 @@ package server
import (
"fmt"
"github.com/gorilla/mux"
"gitlab.com/kbr4/svevijesti/internal/database"
"net/http"
"strconv"
"time"
"github.com/gorilla/mux"
"gitlab.com/kbr4/svevijesti/internal/database"
)
func rootHandler(wr http.ResponseWriter, req *http.Request) {
@@ -24,11 +25,25 @@ func rootHandler(wr http.ResponseWriter, req *http.Request) {
dayBefore := "/dan/" + time.Now().Add(-24*time.Hour).Format("2006-01-02")
cities := []string{"Sarajevo", "Banja Luka", "Zenica", "Tuzla", "Mostar"}
var weatherInfo []WeatherData
for _, city := range cities {
data, err := getWeather(city)
if err != nil {
fmt.Printf("Error fetching weather for %s: %v\n", city, err)
continue
}
weatherInfo = append(weatherInfo, data)
}
data := map[string]interface{}{
"title": title,
"articles": articles,
"previous": dayBefore,
"next": "/",
"title": title,
"articles": articles,
"previous": dayBefore,
"next": "/",
"weatherInfo": weatherInfo,
"categories": CategoryMenu,
}
err = templates.ExecuteTemplate(wr, "homeHTML", data)
@@ -62,11 +77,25 @@ func dailyArticlesHandler(wr http.ResponseWriter, req *http.Request) {
http.Error(wr, err.Error(), http.StatusInternalServerError)
}
cities := []string{"Sarajevo", "Banja Luka", "Zenica", "Tuzla", "Mostar"}
var weatherInfo []WeatherData
for _, city := range cities {
data, err := getWeather(city)
if err != nil {
fmt.Printf("Error fetching weather for %s: %v\n", city, err)
continue
}
weatherInfo = append(weatherInfo, data)
}
data := map[string]interface{}{
"title": title,
"articles": articles,
"previous": dayBefore,
"next": dayAfter,
"title": title,
"articles": articles,
"previous": dayBefore,
"next": dayAfter,
"weatherInfo": weatherInfo,
"categories": CategoryMenu,
}
err = templates.ExecuteTemplate(wr, "homeHTML", data)
@@ -97,10 +126,11 @@ func articleHandler(wr http.ResponseWriter, req *http.Request) {
title := article.Title
data := map[string]interface{}{
"title": title,
"article": article,
"previous": previous,
"next": next,
"title": title,
"article": article,
"previous": previous,
"next": next,
"categories": CategoryMenu,
}
err = templates.ExecuteTemplate(wr, "articleHTML", data)

View File

@@ -0,0 +1,68 @@
package server
import (
"net/http"
"time"
"github.com/gorilla/mux"
"gitlab.com/kbr4/svevijesti/internal/database"
"gitlab.com/kbr4/svevijesti/internal/model"
)
// CategoryMenu lists the category names handed to every page template under
// the "categories" key.
//
// NOTE(review): handleCategory selects articles by exact match on the stored
// category string, and the Python scraper in this change set stores
// "Nauka i tehnologija" for its 'scitech' class, which does not equal
// "Scitech" below — confirm which spelling is intended.
var CategoryMenu = []string{
	"Politika",
	"Biznis",
	"Sport",
	"Magazin",
	"Scitech",
	"Ostalo",
}
// handleCategory renders the "categoryHTML" template with the articles whose
// Category equals the {category} path variable.
//
// The path variable is first tried as a YYYY-MM-DD date; when parsing fails
// (always the case for a plain category name) today's date is used to load
// the day's articles.
func handleCategory(wr http.ResponseWriter, r *http.Request) {
	category := mux.Vars(r)["category"]

	store, err := database.Connect()
	if err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
		return
	}
	defer store.Close()

	currentDate, err := time.Parse("2006-01-02", category)
	if err != nil {
		currentDate = time.Now()
	}

	articles, err := database.ArticlesForDay(store, currentDate)
	if err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
		return
	}

	// Keep only the requested category; stays nil when nothing matches, which
	// is what the template received before as well.
	var selected []model.DisplayArticle
	for _, article := range articles {
		if article.Category == category {
			selected = append(selected, article)
		}
	}

	prevDay := currentDate.AddDate(0, 0, -1)

	// NOTE(review): "previous" is a bare date here, whereas the other page
	// handlers pass a "/dan/<date>" path — confirm what categoryHTML expects.
	data := map[string]interface{}{
		"title":           category,
		"currentCategory": category,
		"articles":        selected,
		"categories":      CategoryMenu,
		"previous":        prevDay.Format("2006-01-02"),
		"next":            "/",
	}

	if err := templates.ExecuteTemplate(wr, "categoryHTML", data); err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
		return
	}
}

View File

@@ -2,11 +2,12 @@ package server
import (
"fmt"
"github.com/gorilla/mux"
"html/template"
"io/ioutil"
"path/filepath"
"strings"
"github.com/gorilla/mux"
)
var tPath = "./web/tpl/"
@@ -42,5 +43,7 @@ func CreateRoutes() *mux.Router {
r.HandleFunc("/dan/{date}", dailyArticlesHandler)
r.HandleFunc("/{id:[0-9]+}/{slug}", articleHandler)
r.HandleFunc("/", rootHandler)
r.HandleFunc("/weather", WeatherHandler)
r.HandleFunc("/{category}", handleCategory)
return r
}

104
internal/server/weather.go Normal file
View File

@@ -0,0 +1,104 @@
package server
import (
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"os"
"github.com/joho/godotenv"
)
// apiKey is the value of the API_KEY environment variable; getWeather sends
// it as the "appid" query parameter.
var apiKey string

// init loads an optional .env file and reads API_KEY from the environment.
// The process exits with status 1 only when API_KEY ends up empty.
func init() {
	// A missing or unreadable .env file is reported but is not fatal: the key
	// may already be present in the real process environment.
	if err := godotenv.Load(); err != nil {
		fmt.Fprintln(os.Stderr, "Error loading .env file:", err)
	}

	apiKey = os.Getenv("API_KEY")
	if apiKey == "" {
		fmt.Fprintln(os.Stderr, "API_KEY environment variable not set.")
		os.Exit(1)
	}
}
// WeatherData is the subset of the weather API's JSON response that
// getWeather decodes.
//
// The Go field names FellsLike and Preassure are kept as they are (code
// outside this block may reference them); only their JSON tags are corrected
// to the keys the API actually sends, "feels_like" and "pressure".
type WeatherData struct {
	Coord struct {
		Lat float64 `json:"lat"`
		Lon float64 `json:"lon"`
	} `json:"coord"`
	Weather []struct {
		Description string `json:"description"`
		Icon        string `json:"icon"`
	} `json:"weather"`
	Main struct {
		Temp      float64 `json:"temp"`
		FellsLike float64 `json:"feels_like"`
		Preassure int     `json:"pressure"`
		Humidity  int     `json:"humidity"`
		TempMin   float64 `json:"temp_min"`
		TempMax   float64 `json:"temp_max"`
	} `json:"main"`
	Wind struct {
		Speed float64 `json:"speed"`
		Deg   float64 `json:"deg"`
	} `json:"wind"`
	Clouds struct {
		All int `json:"all"`
	} `json:"clouds"`
	Name string `json:"name"`
}
// getWeather fetches the current weather for city (metric units, Croatian
// descriptions) and decodes the JSON response into a WeatherData.
//
// It returns an error when the request fails, when the server answers with
// anything other than 200 OK, or when the body is not valid JSON for
// WeatherData.
func getWeather(city string) (WeatherData, error) {
	req, err := http.NewRequest(http.MethodGet, "https://api.openweathermap.org/data/2.5/weather", nil)
	if err != nil {
		return WeatherData{}, err
	}

	// Build the query through url.Values so that city names containing
	// spaces (e.g. "Banja Luka") are percent-encoded.
	query := req.URL.Query()
	query.Set("q", city)
	query.Set("appid", apiKey)
	query.Set("units", "metric")
	query.Set("lang", "hr")
	req.URL.RawQuery = query.Encode()

	// NOTE(review): http.DefaultClient has no timeout; consider a shared
	// client with one, since page handlers call this once per city.
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return WeatherData{}, err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return WeatherData{}, err
	}

	// Error responses are JSON too and would otherwise decode "successfully"
	// into an all-zero WeatherData.
	if resp.StatusCode != http.StatusOK {
		return WeatherData{}, fmt.Errorf("weather request for %q: unexpected status %s", city, resp.Status)
	}

	var weatherData WeatherData
	if err := json.Unmarshal(body, &weatherData); err != nil {
		return WeatherData{}, err
	}

	return weatherData, nil
}
// WeatherHandler renders the "weatherHTML" template with the weather of a
// fixed list of cities. Cities whose lookup fails are logged to stdout and
// left out of the page.
func WeatherHandler(w http.ResponseWriter, r *http.Request) {
	var weatherInfo []WeatherData
	for _, name := range []string{"Sarajevo", "Banja Luka", "Zenica", "Tuzla", "Mostar"} {
		report, fetchErr := getWeather(name)
		if fetchErr != nil {
			fmt.Printf("Error fetching weather for %s: %v\n", name, fetchErr)
			continue
		}
		weatherInfo = append(weatherInfo, report)
	}

	page := map[string]interface{}{
		"title":       "Vremenska Prognoza",
		"weatherInfo": weatherInfo,
		"categories":  CategoryMenu,
	}

	if renderErr := templates.ExecuteTemplate(w, "weatherHTML", page); renderErr != nil {
		fmt.Println("Error executing template:", renderErr)
		http.Error(w, "Internal Server Error", http.StatusInternalServerError)
	}
}

21
pyth/.gitlab-ci.yml Normal file
View File

@@ -0,0 +1,21 @@
stages:
- test
variables:
before_script:
- pip install -r requirements.txt
test_file1:
stage: test
script:
- python -m pytest tests/test_scrapingsingle.py
only:
- master
test_file2:
stage: test
script:
- python -m pytest tests/test_vectData.py
only:
- master

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

122
pyth/checking_similar.py Normal file
View File

@@ -0,0 +1,122 @@
import psycopg2
from dotenv import load_dotenv
import os
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from db_management import get_specific_data, modify_similar_data, insert_data, preparing_articles, calculate_cosine_similarity, get_titles_links_embeddings
from get_articles import slice_text_at_2k_tokens
import json
from json_repair import repair_json
from publishing_finals import publish_articles
# Load variables from a .env file into the process environment before the
# API clients are constructed.
load_dotenv()
# Read for reference only; the value is not passed to the clients below.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()
print("Checking for similar!")
def find_and_group_similar_articles(eps=0.5, min_samples=2, threshold=0.95):
    """Group ready articles whose embeddings are more similar than `threshold`.

    Each article starts a group unless an earlier group already claimed it;
    every later, unclaimed article with a different link and a cosine
    similarity above `threshold` joins that group.

    `eps` and `min_samples` are accepted for compatibility and are not used.

    Returns a list of groups; each group is a list of (title, link) tuples
    with the seed article first. Returns [] when a psycopg2 error occurs.
    """
    try:
        titles, links, embeddings = get_titles_links_embeddings()
        candidates = list(zip(titles, links, embeddings))
        processed_articles = set()
        grouped_similar_articles = []

        for i, (title1, link1, embedding1) in enumerate(candidates):
            if (title1, link1) in processed_articles:
                continue
            processed_articles.add((title1, link1))
            group = [(title1, link1)]

            # Everything before index i is already in processed_articles, so
            # only the tail can still join this group.
            for title2, link2, embedding2 in candidates[i + 1:]:
                if (title2, link2) in processed_articles or link1 == link2:
                    continue
                if calculate_cosine_similarity(embedding1, embedding2) > threshold:
                    processed_articles.add((title2, link2))
                    # Same (title, link) shape as the seed entry.
                    group.append((title2, link2))

            grouped_similar_articles.append(group)

        return grouped_similar_articles

    except psycopg2.Error as e:
        print(f"Error: {e}")
        return []
def processing_articles(articles):
    """Mark a group of similar articles as combined and build the merged input.

    For every article (a tuple whose first two items are title and link) the
    stored row is tagged with "C: <all titles>" in `similar_d` and its `ready`
    flag is set to False.

    Returns (combined_text, link): the articles' texts joined by spaces and
    passed through slice_text_at_2k_tokens, and the distinct links joined by
    ", " in first-seen order.
    """
    combined_marker = f"C: {', '.join(art[0] for art in articles)}"
    texts = []
    unique_links = []

    for article in articles:
        a_title, a_link = article[:2]
        # One lookup per article; the text column is not touched by the two
        # updates below, so it is safe to read it first and reuse it.
        texts.append(get_specific_data(a_title)[0][1])
        modify_similar_data(combined_marker, a_title)
        preparing_articles(False, a_title)
        if a_link not in unique_links:
            unique_links.append(a_link)

    combined_text = slice_text_at_2k_tokens(' '.join(texts))
    link = ', '.join(unique_links)
    return combined_text, link
def processing_similar():
    """Merge each group of similar articles into one article via the chat model.

    For every group with more than one article the texts are combined, sent
    to the chat completion endpoint, and the returned 'content' is stored
    with insert_data under the first article's title and category.
    Errors for a group are printed and the loop moves on.
    """
    grouped_similar_articles_result = find_and_group_similar_articles()

    if grouped_similar_articles_result:
        for group in grouped_similar_articles_result:
            articles = group
            if len(articles) > 1:
                combined_text, link = processing_articles(articles)
                user_message = (
                    rf"Here are {len(articles)} texts {combined_text}, combine the following texts into a cohesive news, "
                    rf"remove any non-news related to all texts, and provide the cleaned data on Bosnian languageas and return as JSON only with a single 'content' field."
                )
                try:
                    completion = client.chat.completions.create(
                        model="gpt-3.5-turbo",
                        messages=[
                            {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                            {"role": "user", "content": user_message}
                        ])

                    # The reply is repaired before parsing because it is not
                    # guaranteed to be strictly valid JSON.
                    generated_text = repair_json(completion.choices[0].message.content)
                    response_data = json.loads(generated_text)

                    title = articles[0][0]
                    text = response_data["content"]
                    # The embedding is computed over the whole JSON string,
                    # not only over `text`.
                    vector = embeddings.embed_query(generated_text)
                    tmpCategory = get_specific_data(title)
                    # Index 5 is the `category` column of get_specific_data's SELECT.
                    category = tmpCategory[0][5]
                    insert_data(title, text, link, vector, f"C: {', '.join(art[0] for art in articles)}", category)
                    print(f"Inserting combined: {title} and Category: {category}")

                except Exception as e:
                    print(f"Error: {e}")
                    print(articles[0][0])
                    continue
            # NOTE(review): indentation was lost in the source; this `else` is
            # assumed to pair with `if len(articles) > 1` — confirm.
            else:
                print("Done!.")
    else:
        print("No similar articles found.")
# Script entry point: merge similar articles first, then publish.
if __name__ == "__main__":
    processing_similar()
    publish_articles()

249
pyth/db_management.py Normal file
View File

@@ -0,0 +1,249 @@
import psycopg2
from psycopg2 import sql
from pgvector.psycopg2 import register_vector
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from dotenv import load_dotenv
from datetime import datetime ,timedelta
# Connection settings come from the environment (optionally via a .env file).
load_dotenv()
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME")
# A single module-level connection, opened at import time and shared by every
# function in this module.
conn = psycopg2.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    dbname=dbname
)
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors as a float.

    Accepts any array-like input (lists or numpy arrays). When either vector
    has zero length the similarity is defined as 0.0 instead of producing
    NaN from a division by zero.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
def parse_embedding_string(embedding_str):
    """Turn an embedding into a numpy array.

    A numpy array is returned unchanged. A string has its first and last
    character dropped and the remainder split on commas into floats.
    Any other type raises ValueError.
    """
    if isinstance(embedding_str, np.ndarray):
        return embedding_str
    if not isinstance(embedding_str, str):
        raise ValueError("Invalid type for embedding_str. Must be either str or np.ndarray.")
    inner = embedding_str[1:-1]
    values = [float(piece) for piece in inner.split(',')]
    return np.array(values)
def is_similar_data(title, text, link, embedding, threshold=0.98, category=None):
    """Check whether an article's embedding matches an already stored one.

    Every stored embedding is compared with `embedding`; the first one whose
    cosine similarity exceeds `threshold` decides the outcome:

    * different link - the new article is inserted with `similar_d` set to
      the matching title (and `category`), and True is returned;
    * same link - nothing is inserted and True is returned.

    Returns False when no stored embedding is similar enough; the caller is
    then expected to insert the article itself.

    `category` is optional so that existing four/five-argument calls keep
    working; it is forwarded to insert_data, which requires it.
    """
    cursor = conn.cursor()
    try:
        cursor.execute('SELECT title,embedding,link FROM vectorsvevijesti;')
        existing_rows = cursor.fetchall()
    finally:
        cursor.close()

    for existing_title, raw_embedding, existing_link in existing_rows:
        existing_embedding = np.array(raw_embedding).flatten()
        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if similarity <= threshold:
            continue

        if link != existing_link:
            # insert_data takes six arguments; omitting category here raised
            # TypeError on every cross-source match.
            insert_data(title, text, link, embedding, existing_title, category)
            print(f"Similar data found: \n #{title} \n #{existing_title}")
            print(f"Inserting: #{title}")
        else:
            print("Same article of same source!")
        return True

    print(f"Inserting: #{title}")
    return False
def get_similar():
    """Return (title, link, similar_d) rows whose similar_d is neither 'NO' nor 'SOURCE'."""
    cur = conn.cursor()
    cur.execute('''SELECT title, link, similar_d FROM vectorsvevijesti WHERE similar_d NOT IN ('NO', 'SOURCE')''')
    rows = cur.fetchall()
    cur.close()
    return rows
def get_titles_links_embeddings():
    """Return three parallel lists (titles, links, embeddings) for rows with ready = True.

    Each embedding is passed through parse_embedding_string.
    """
    cur = conn.cursor()
    cur.execute('SELECT title, link, embedding FROM vectorsvevijesti WHERE ready = True;')
    rows = cur.fetchall()
    cur.close()

    titles, links, embeddings = [], [], []
    for row in rows:
        titles.append(row[0])
        links.append(row[1])
        embeddings.append(parse_embedding_string(row[2]))

    return titles, links, embeddings
def insert_data(title, text, link, embedding, similar_d,category):
    """Insert one row into vectorsvevijesti with ready = True and the current local time, then commit."""
    inserted_at = datetime.now()
    row = (title, text, link, embedding, similar_d, inserted_at, True, category)
    cur = conn.cursor()
    cur.execute('''
        INSERT INTO vectorsvevijesti (title, text, link, embedding, similar_d, time, ready, category)
        VALUES (%s, %s, %s, %s, %s ,%s ,%s ,%s);
    ''', row)
    conn.commit()
    cur.close()
def insert_final(title,text,slug,link,source_id, category):
    """Insert one published article into `articles` and commit.

    Rows that collide with any unique constraint of the table are skipped
    silently. The table declares title, slug and original_url as UNIQUE;
    handling only original_url let a duplicate title or slug raise and leave
    the shared connection in an aborted transaction.
    """
    with conn.cursor() as cursor:
        cursor.execute(
            '''INSERT INTO articles (title, content, slug, original_url, source_id, category)
            VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;''',
            (title, text, slug, link, source_id, category),
        )
        conn.commit()
def get_data():
    """Return every (title, text, link) row of vectorsvevijesti."""
    cur = conn.cursor()
    cur.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
    rows = cur.fetchall()
    cur.close()
    return rows
def get_ready_data():
    """Return (title, text, link, time, similar_d, category) rows selected with ready = 'True'."""
    sql_text = '''SELECT title, text, link, time, similar_d, category FROM vectorsvevijesti WHERE ready = %s;'''
    cur = conn.cursor()
    cur.execute(sql_text, ('True',))
    rows = cur.fetchall()
    cur.close()
    return rows
def get_source_data():
    """Return (title, text, link, ready) rows selected with ready = 'False'."""
    sql_text = '''SELECT title, text, link, ready FROM vectorsvevijesti WHERE ready = %s;'''
    cur = conn.cursor()
    cur.execute(sql_text, ('False',))
    rows = cur.fetchall()
    cur.close()
    return rows
def modify_similar_data(new_value ,title):
    """Set `similar_d` to `new_value` on every row with the given title, then commit."""
    query = '''UPDATE vectorsvevijesti SET similar_d = %s WHERE title = %s '''
    # The context manager closes the cursor, which was previously left open.
    with conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
    conn.commit()
def preparing_articles(new_value ,title):
    """Set the `ready` flag to `new_value` on every row with the given title, then commit."""
    query = '''UPDATE vectorsvevijesti SET ready = %s WHERE title = %s '''
    # The context manager closes the cursor, which was previously left open.
    with conn.cursor() as cursor:
        cursor.execute(query, (new_value, title))
    conn.commit()
def get_specific_data(title):
    """Return all rows with the given title.

    Column order: title, text, link, similar_d, embedding, category, ready.
    """
    sql_text = '''SELECT title, text, link, similar_d, embedding, category, ready FROM vectorsvevijesti WHERE title = %s'''
    cur = conn.cursor()
    cur.execute(sql_text, (title,))
    rows = cur.fetchall()
    cur.close()
    return rows
def get_all_links():
    """Return the set of all `link` values stored in vectorsvevijesti."""
    cur = conn.cursor()
    cur.execute('''SELECT link FROM vectorsvevijesti''')
    known_links = set()
    for row in cur.fetchall():
        known_links.add(row[0])
    cur.close()
    return known_links
def get_existing_titles():
    """Return the set of titles present in the articles table.

    The query also selects original_url, but only the first column (title)
    is placed in the result.
    NOTE(review): callers that test a URL against this set will never get a
    match — confirm whether URLs were meant to be included.
    """
    cur = conn.cursor()
    cur.execute('''SELECT title, original_url FROM articles''')
    found = set()
    for row in cur.fetchall():
        found.add(row[0])
    cur.close()
    return found
def delete_specific(title):
    """Delete every vectorsvevijesti row with the given title and commit."""
    query = '''DELETE FROM vectorsvevijesti WHERE title = %s'''
    cursor = conn.cursor()
    cursor.execute(query,(title,))
    # Without a commit the delete stays in an open transaction and is never
    # persisted; every other writer in this module commits.
    conn.commit()
    cursor.close()
def cleansing():
    """Delete vectorsvevijesti rows whose `time` is more than one day in the past, then commit."""
    cutoff = datetime.now() - timedelta(days=1)
    cur = conn.cursor()
    cur.execute('''DELETE FROM vectorsvevijesti WHERE time < %s''', (cutoff,))
    conn.commit()
    cur.close()
def drop_table():
    """Drop the vectorsvevijesti table if it exists, then commit."""
    cur = conn.cursor()
    cur.execute('''DROP TABLE IF EXISTS vectorsvevijesti;''')
    conn.commit()
    cur.close()
def create_db():
    """Enable the `vector` extension and create vectorsvevijesti if it is missing.

    This replaces two back-to-back definitions of create_db: the first one
    (without the `category` column) was always shadowed by the second, so
    only the second's schema is kept.

    NOTE(review): CREATE TABLE IF NOT EXISTS does not add `category` to a
    table created before that column existed — confirm whether a migration
    is needed.
    """
    cursor = conn.cursor()
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
    # Register the pgvector type adapter on the shared connection.
    register_vector(conn)
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS vectorsvevijesti (
        id bigserial PRIMARY KEY,
        title VARCHAR,
        text VARCHAR,
        link VARCHAR,
        embedding vector(1536),
        similar_d VARCHAR,
        time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        ready BOOLEAN,
        category VARCHAR
        );
    ''')
    conn.commit()
    cursor.close()
def create_ar_table():
    """Create the `articles` table if it is missing, then commit.

    title, slug and original_url are each declared UNIQUE.
    """
    ddl = '''
        CREATE TABLE IF NOT EXISTS "articles" (
        "id" bigserial PRIMARY KEY,
        "title" text NOT NULL UNIQUE,
        "content" text NOT NULL,
        "slug" text NOT NULL UNIQUE,
        "created_at" timestamptz DEFAULT NOW() NOT NULL,
        "original_url" text NOT NULL UNIQUE,
        "source_id" int NOT NULL,
        "category" VARCHAR
        );
    '''
    cur = conn.cursor()
    cur.execute(ddl)
    conn.commit()
    cur.close()
import psycopg2
from psycopg2 import sql
# Runs on every import of this module: make sure both tables exist.
create_db()
create_ar_table()

2
pyth/delete_db.py Normal file
View File

@@ -0,0 +1,2 @@
# Drops the vectorsvevijesti table. The visible db_management module defines
# drop_table() and no delete_tables, so the previous import failed with
# ImportError before anything ran.
from db_management import drop_table
drop_table()

171
pyth/get_articles.py Normal file
View File

@@ -0,0 +1,171 @@
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from openai import OpenAI
import os
from langchain_openai import OpenAIEmbeddings
from db_management import (insert_data ,is_similar_data ,get_all_links,cleansing )
import json
from dotenv import load_dotenv
import tiktoken
from json_repair import repair_json
# Everything below runs at import time, including for modules that only
# import a helper from this file.
load_dotenv()
# Remove rows older than one day before collecting new articles.
cleansing()
# Read for reference only; the value is not passed to the clients below.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()
# Front pages that are scanned for article links.
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info','https://www.index.hr', 'https://avaz.ba', 'https://www.telegraf.rs', 'https://www.blic.rs', 'https://www.vijesti.me','https://dnevnik.hr','https://24sata.hr']
# User-Agent intended for the HTTP requests made by this script.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
def num_tokens_from_string(string: str, model="gpt-3.5-turbo") -> int:
    """Return how many tokens `string` encodes to under the tokenizer for `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)
def slice_text_at_2k_tokens(text):
    """Return `text` cut to at most 1950 tokens of the gpt-3.5-turbo tokenizer.

    Always returns a str. Short input used to come back wrapped in a
    one-element list, so its list repr ended up inside prompts.
    """
    max_tokens = 1950
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])
def slice_title_if_needed(text):
    """Return `text` cut to at most 100 tokens of the gpt-3.5-turbo tokenizer.

    Always returns a str. Short input used to come back wrapped in a
    one-element list.
    """
    max_tokens = 100
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoding.decode(tokens[:max_tokens])
def replace_with_spaces(text):
    """Replace every character outside the allowed set with a single space.

    Allowed: ASCII letters, digits, the space, and the Bosnian/Croatian/
    Serbian Latin letters Č č Ć ć Đ đ Š š Ž ž. The previous literal had a
    mis-encoded run where đ, Š, š and Ž belong, so those letters were being
    stripped from titles and texts.
    """
    allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐđŠšŽž0123456789 "
    cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text)
    return cleaned_text
def fix_links(links_set):
    """Return a new set in which every link containing "www" has each "www." removed."""
    return {
        candidate.replace("www.", "") if "www" in candidate else candidate
        for candidate in links_set
    }
# Every article link discovered across all front pages.
total_links = set()
# Not referenced anywhere else in the visible part of this file.
collected_news = set()
def get_article_links(url, already_checked):
    """Collect absolute links found inside <article> elements of `url`.

    Links already present in `already_checked` are skipped; new ones are
    added to that set (it is mutated in place) and returned in page order.
    Returns an empty list when the page does not answer with HTTP 200.
    """
    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so the User-Agent was being sent as a query
    # string instead of a header. The timeout keeps one dead site from
    # hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for anchor in article.find_all('a', href=True):
            link_value = urljoin(url, anchor['href'])
            if link_value not in already_checked:
                link_store.append(link_value)
                already_checked.add(link_value)
    return link_store
# NOTE: this crawl runs at import time, not only when the file is executed
# as a script.
already_checked = set()
for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    # get_article_links may yield nothing for a page; skip those.
    if temp_links:
        total_links.update(temp_links)
# Drop empty entries.
final_links = {item for item in total_links if item}
db_links = set(get_all_links())
# Keep only links that are not stored yet.
new_links = final_links - db_links
final_links = new_links
final_links = set(final_links)
# The "www." prefix is stripped after the comparison with db_links above, so
# the main loop re-checks each link against db_links.
final_links = fix_links(final_links)
# Script entry point: download every new link, have the chat model clean and
# classify it, embed the result and store it unless a similar article exists.
if __name__ == '__main__':
    # Classes offered to the model, and the names stored for them.
    category_options = ['politics','business','sport','magazine','scitech']
    # NOTE(review): 'scitech' is stored as 'Nauka i tehnologija' while the Go
    # CategoryMenu in this change set lists "Scitech" — confirm which is intended.
    category_translation = {
        'politics': 'Politika',
        'business': 'Biznis',
        'sport': 'Sport',
        'magazine': 'Magazin',
        'scitech': 'Nauka i tehnologija',
        'other': 'Ostalo',
    }

    for link in final_links:
        if link not in db_links:
            print(f"Processing link: {link}")
            db_links.add(link)

            # Everything for one link sits inside the try so that a single
            # unreachable or malformed page cannot abort the whole run.
            try:
                # `headers` must be a keyword argument; positionally it is
                # taken as `params` and sent in the query string.
                response = requests.get(link, headers=headers, timeout=30)
                soup = BeautifulSoup(response.text, 'html.parser')

                titles = soup.find_all(['h2', 'h1','h3'])
                title_text = ' '.join([title.get_text(strip=True) for title in titles])

                texts = soup.find_all(['p'])
                text_text = ' '.join([text.get_text(strip=True) for text in texts])

                title_text = replace_with_spaces(title_text)
                text_text = slice_text_at_2k_tokens(text_text)
                text_text = replace_with_spaces(str(text_text))
                ttk = num_tokens_from_string(text_text)

                if ttk > 1900:
                    title_text = slice_title_if_needed(title_text)

                completion = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                        {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title and remove 'FOTO' and 'VIDEO' from title and text, from {category_options} select category in wich that news belong, and provide the cleaned data make sure that its on Bosnian language and valid JSON object with 'title' field, 'category' and 'content' field."}
                    ])

                generated_text = completion.choices[0].message.content
                # The reply is repaired before parsing because it is not
                # guaranteed to be strictly valid JSON.
                generated_text = repair_json(generated_text)
                response_data = json.loads(generated_text)
                title = response_data["title"]
                predicted_category = response_data["category"]
                text = response_data["content"]

                # Anything outside the offered classes falls back to 'other'.
                if predicted_category.lower() in category_options:
                    category = predicted_category.lower()
                else:
                    category = 'other'
                category = category_translation.get(category, category.capitalize())

                vector = embeddings.embed_query(generated_text)
                print(f"Category: {category}")

                if not is_similar_data(title, text, link, vector, threshold=0.98):
                    similar_d = "NO"
                    insert_data(title, text, link, vector,similar_d,category)

            except Exception as e:
                print(f"Error in completion: {e}")
                continue

69
pyth/publishing_finals.py Normal file
View File

@@ -0,0 +1,69 @@
from slugify import slugify
import random
from db_management import get_ready_data,insert_final,get_existing_titles
def create_slug(title):
    """Return a slug of "<random 1..1000> <title>"."""
    prefix = random.randint(1, 1000)
    return slugify(f"{prefix} {title}")
def get_source_id(link,similar):
    """Map an article link to its numeric source id.

    Anything whose `similar` marker is not exactly "NO" gets id 5.
    Otherwise the first marker substring found in `link` decides, in the
    order listed below; 0 means no marker matched.
    """
    if similar != "NO":
        return 5

    # Order matters: the first matching substring wins.
    source_markers = (
        ("srpskainfo", 1),
        ("klix", 2),
        ("bljesak", 3),
        ("blic", 4),
        ("index.hr", 6),
        ("avaz", 7),
        ("telegraf", 8),
        ("vijesti.me", 9),
        ("dnevnik.hr", 10),
        ("24sata.hr", 11),
    )
    for marker, source_id in source_markers:
        if marker in link:
            return source_id
    return 0
# Snapshot of the ready rows, taken once when this module is imported.
data = get_ready_data()
def remove_braces_and_quotes(text):
    """Strip every '{"' and '"}' sequence from `text`."""
    return text.replace('{"', '').replace('"}', '')
def publish_articles():
    """Publish every row of ``data`` that is not already in the final table.

    For each row the existing entries are re-read with get_existing_titles();
    the row is skipped when its title or its link is found in any of them.
    Otherwise title and text are cleaned with remove_braces_and_quotes() and
    the article is stored through insert_final().
    """
    for row in data:
        title, text, link = row[0], row[1], row[2]
        similar_d, category = row[4], row[5]
        slug = create_slug(title)
        source_id = get_source_id(link, similar_d)

        existing = get_existing_titles()
        title_check = any(title in entry for entry in existing)
        link_check = any(link in entry for entry in existing)
        if title_check or link_check:
            continue

        text = remove_braces_and_quotes(text)
        title = remove_braces_and_quotes(title)
        print(f"Source: {source_id}")
        print(f"Link: {link}")
        insert_final(title, text, slug, link, source_id, category)
        print(f"Publishing: {title}")

141
pyth/requirements.txt Normal file
View File

@@ -0,0 +1,141 @@
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.2.0
apturl==0.5.2
async-timeout==4.0.3
attrs==23.1.0
beautifulsoup4==4.12.2
blinker==1.7.0
blis==0.7.11
Brlapi==0.8.3
catalogue==2.0.10
certifi==2020.6.20
chardet==4.0.0
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.16.0
colorama==0.4.4
command-not-found==0.3
confection==0.1.4
cryptography==3.4.8
cupshelpers==1.0
cymem==2.0.8
dataclasses-json==0.6.3
DateTime==5.4
dbus-python==1.2.18
decorator==4.4.2
defer==1.0.6
distro==1.7.0
distro-info==1.1+ubuntu0.1
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
exceptiongroup==1.2.0
Flask==3.0.0
Flask-Cors==4.0.0
frozenlist==1.4.1
greenlet==1.1.2
gyp==0.1
h11==0.14.0
httpcore==1.0.2
httplib2==0.20.2
httpx==0.25.2
idna==3.3
importlib-metadata==4.6.4
itsdangerous==2.1.2
jeepney==0.7.1
Jinja2==3.1.2
joblib==1.3.2
jsonpatch==1.33
jsonpointer==2.4
keyring==23.5.0
langchain==0.0.352
langchain-community==0.0.6
langchain-core==0.1.3
langcodes==3.3.0
langsmith==0.0.74
language-selector==0.1
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
louis==3.20.0
macaroonbakery==1.3.1
MarkupSafe==2.1.3
marshmallow==3.20.1
more-itertools==8.10.0
multidict==6.0.4
murmurhash==1.0.10
mypy-extensions==1.0.0
netifaces==0.11.0
numpy==1.26.2
oauthlib==3.2.0
olefile==0.46
openai==1.5.0
packaging==23.2
pbr==5.8.0
pexpect==4.8.0
pgvector==0.2.4
Pillow==9.0.1
preshed==3.0.9
protobuf==3.12.4
psycopg==3.1.15
psycopg2-binary==2.9.9
ptyprocess==0.7.0
pycairo==1.20.1
pycups==2.0.1
pydantic==2.5.2
pydantic_core==2.14.5
PyGObject==3.42.1
PyJWT==2.3.0
pymacaroons==0.13.0
PyNaCl==1.5.0
pyparsing==2.4.7
pyRFC3339==1.1
python-apt==2.4.0+ubuntu2
python-dateutil==2.8.1
python-debian==0.1.43+ubuntu1.1
python-dotenv==1.0.0
pytz==2022.1
pyxdg==0.27
PyYAML==5.4.1
regex==2023.10.3
reportlab==3.6.8
requests==2.31.0
scikit-learn==1.3.2
scipy==1.11.4
SecretStorage==3.3.1
six==1.16.0
slugify==0.0.1
smart-open==6.4.0
sniffio==1.3.0
soupsieve==2.5
spacy==3.7.2
spacy-legacy==3.0.12
spacy-loggers==1.0.5
SQLAlchemy==1.4.31
sqlalchemy-migrate==0.13.0
sqlparse==0.4.2
srsly==2.4.8
systemd-python==234
Tempita==0.5.2
tenacity==8.2.3
thinc==8.2.2
threadpoolctl==3.2.0
tiktoken==0.5.2
tqdm==4.66.1
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.9.0
ubuntu-advantage-tools==8001
ubuntu-drivers-common==0.0.0
ufw==0.36.1
unattended-upgrades==0.1
urllib3==1.26.5
wadllib==1.3.6
wasabi==1.1.2
weasel==0.3.4
Werkzeug==3.0.1
xdg==5
xkit==0.0.0
yarl==1.9.4
zipp==1.0.0
zope.interface==6.1

View File

@@ -1,87 +0,0 @@
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from openai import OpenAI
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from vectData import insert_data ,is_similar_data
import json
# SECURITY: the OpenAI API key must never be hard-coded in source control.
# Any key that has been committed should be treated as leaked and revoked.
# The key is now expected in the OPENAI_API_KEY environment variable.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
client = OpenAI()
embeddings = OpenAIEmbeddings()
# Front pages that are crawled for article links.
dlinks = ['https://klix.ba', 'https://srpskainfo.com', 'https://bljesak.info']
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
total_links = set()
collected_news = set()
def get_article_links(url, already_checked):
    """Collect links found inside ``<article>`` elements of a page.

    Args:
        url: Page to download; relative hrefs are resolved against it.
        already_checked: Set of links seen so far. It is updated in place
            with every new link returned by this call.

    Returns:
        A list of absolute links not present in ``already_checked`` before the
        call. An empty list is returned when the response status is not 200.
    """
    # headers must be passed by keyword: the second positional argument of
    # requests.get() is ``params``, which would put the dict in the query string.
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    link_store = []
    for article in soup.find_all('article'):
        for anchor in article.find_all('a', href=True):
            link_value = urljoin(url, anchor['href'])
            if link_value not in already_checked:
                link_store.append(link_value)
                already_checked.add(link_value)
    return link_store
# Step 1: gather article links from every configured front page.
already_checked = set()
for dlink in dlinks:
    temp_links = get_article_links(dlink, already_checked)
    if temp_links:
        total_links.update(temp_links)
final_links = {item for item in total_links if item}

# Step 2: download each article, let the model clean it, and store it unless
# a similar article is already in the database.
for link in final_links:
    # headers must be a keyword argument; positionally it would be sent as
    # query-string ``params`` and the User-Agent would never be applied.
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    titles = soup.find_all(['h2', 'h1','h3'])
    title_text = ' '.join([title.get_text(strip=True) for title in titles])
    texts = soup.find_all(['p'])
    text_text = ' '.join([text.get_text(strip=True) for text in texts])
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
            ]
        )
        generated_text = completion.choices[0].message.content
        response_data = json.loads(generated_text)
        title = response_data["title"]
        text = response_data["content"]
        print("*********************************")
        print(f"Title: {title}")
        print("---------------------------------")
        print(f"Content : {text}")
        print("*********************************")
        # The embedding is computed over the raw model output, not the parsed fields.
        vector = embeddings.embed_query(generated_text)
        if not is_similar_data(title, text, link, vector, threshold=0.9):
            insert_data(title, text, link, vector)
    except Exception as e:
        # Best effort: a failure on one article must not stop the crawl.
        print(f"Error in completion: {e}")
        continue

Binary file not shown.

View File

@@ -0,0 +1,60 @@
import unittest
from unittest.mock import patch
import requests
from bs4 import BeautifulSoup
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from openai import OpenAI
import json
from dotenv import load_dotenv
from pyth.get_articles import get_article_links, insert_data, is_similar_data
import os
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()
embeddings = OpenAIEmbeddings()
already_checked = set()
total_links = set()
collected_news = set()
dlinks = 'http://127.0.0.1:5000/'
class TestIntegration(unittest.TestCase):
    """End-to-end check of link collection and article clean-up.

    Needs the local fixture server at ``dlinks`` and a working OpenAI key.
    """

    def test_integration(self):
        # The collected links were previously discarded, which left
        # total_links empty: the loop below never ran and the final length
        # assertion could not pass.
        # NOTE(review): assumes get_article_links returns the new links (or a
        # falsy value on failure) — confirm against pyth.get_articles.
        links = get_article_links(dlinks, already_checked)
        total_links.update(links or [])
        self.assertEqual(len(already_checked), 2)
        for link in total_links:
            response = requests.get(link)
            soup = BeautifulSoup(response.text, 'html.parser')
            titles = soup.find_all(['h2', 'h1', 'h3'])
            title_text = ' '.join([title.get_text(strip=True) for title in titles])
            texts = soup.find_all(['p'])
            text_text = ' '.join([text.get_text(strip=True) for text in texts])
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Data analytic, Journalist and News reporter"},
                    {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."}
                ]
            )
            generated_text = completion.choices[0].message.content
            response_data = json.loads(generated_text)
            title = response_data["title"]
            text = response_data["content"]
            vector = embeddings.embed_query(generated_text)
            self.assertIn("Test Title", title)
            self.assertIn("Test Text", text)
        self.assertEqual(len(total_links), 2)

View File

@@ -0,0 +1,89 @@
import unittest
import numpy as np
import psycopg2
import os
from pyth.db_management import calculate_cosine_similarity, is_similar_data, insert_data, get_data, create_db
# Integration tests for pyth.db_management. They talk to a real PostgreSQL
# server whose connection settings are read from the DB_* environment variables.
class TestIntegration(unittest.TestCase):
# Connection settings read at class-definition time; setUpClass reads them again.
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME")
@classmethod
def setUpClass(cls):
# Open one shared connection and create the schema before any test runs.
cls.host = os.getenv("DB_HOST")
cls.port = os.getenv("DB_PORT")
cls.user = os.getenv("DB_USER")
cls.password = os.getenv("DB_PASSWORD")
cls.dbname = os.getenv("DB_NAME")
cls.conn = psycopg2.connect(
host=cls.host,
port=cls.port,
user=cls.user,
password=cls.password,
dbname=cls.dbname
)
# NOTE(review): create_db is called with the connection as its argument — confirm its signature accepts one.
create_db(cls.conn)
@classmethod
def tearDownClass(cls):
cls.conn.close()
def setUp(self):
# tearDown closes the connection after each test, so reconnect when it is found closed.
if self.conn.closed:
self.conn = psycopg2.connect(
host=self.host,
port=self.port,
user=self.user,
password=self.password,
dbname=self.dbname
)
self.cursor = self.conn.cursor()
def tearDown(self):
# Close the cursor and the connection unless they are already closed.
if not self.cursor.closed:
self.cursor.close()
if not self.conn.closed:
self.conn.close()
def test_insert_and_retrieve_data(self):
# Inserts one row and expects get_data() to return exactly that row.
title = 'test_title'
text = 'test_text'
link = 'test_link'
# 1536 integer values, 1 through 1536.
embedding = np.arange(1, 1537)
insert_data(title, text, link, embedding)
data = get_data()
self.assertEqual(data, [(title, text, link)])
def test_is_similar_data_integration(self):
# After inserting a row, the same embedding must be reported as similar.
title = 'test_title'
text = 'test_text'
link = 'test_link'
embedding = np.arange(1, 1537)
insert_data(title, text, link, embedding)
# The same check is repeated three times with identical arguments.
result = is_similar_data(title, text, link, embedding)
self.assertTrue(result)
result = is_similar_data(title, text, link, embedding)
self.assertTrue(result)
result = is_similar_data(title, text, link, embedding)
self.assertTrue(result)
def test_create_db_integration(self):
# The table created by create_db must be listed in information_schema.
cursor = self.conn.cursor()
cursor.execute("SELECT * FROM information_schema.tables WHERE table_name = 'vectorsvevijesti'")
table_exist = bool(cursor.fetchone())
self.assertTrue(table_exist)
if __name__ == '__main__':
unittest.main()

View File

@@ -1,115 +0,0 @@
import os

import numpy as np
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2 import sql
from sklearn.metrics.pairwise import cosine_similarity
# Database connection settings. Values are taken from the DB_* environment
# variables; non-secret settings fall back to the previous local defaults.
host = os.getenv("DB_HOST", "localhost")
port = os.getenv("DB_PORT", "5432")
user = os.getenv("DB_USER", "postgres")
# SECURITY: the password used to be hard-coded here. It has no default on
# purpose — supply it through DB_PASSWORD and rotate any value that was
# committed to the repository.
password = os.getenv("DB_PASSWORD")
dbname = os.getenv("DB_NAME", "vector_svw")
def calculate_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1, v2: Array-likes of equal length; they are flattened before use.

    Returns:
        A float in [-1, 1]. If either vector has zero length (norm 0) the
        result is 0.0; the previous implementation divided by zero there and
        produced NaN values.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    # Cosine similarity already divides by both norms, so normalising the
    # inputs first (as was done before) is redundant.
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
def is_similar_data(title, text, link, embedding, threshold=0.9):
    """Tell whether a stored article is similar to the given embedding.

    Every stored embedding is compared with ``embedding`` using
    calculate_cosine_similarity(); the first one whose similarity exceeds
    ``threshold`` makes the function return True.

    Args:
        title: Title of the candidate article (only used in log output).
        text: Unused by this function.
        link: Unused by this function.
        embedding: Embedding vector of the candidate article.
        threshold: Similarity above which two articles count as similar.

    Returns:
        True if a similar article exists, otherwise False.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    # try/finally so the connection is released even when the query fails.
    try:
        with conn.cursor() as cursor:
            cursor.execute('SELECT title ,embedding FROM vectorsvevijesti;')
            rows = cursor.fetchall()
    finally:
        conn.close()

    for existing_title, stored in rows:
        existing_embedding = np.array(stored).flatten()
        similarity = calculate_cosine_similarity(existing_embedding, embedding)
        if similarity > threshold:
            print(f"Similar data found: \n #{title} \n #{existing_title}")
            return True
    print(f"Inserting: #{title}")
    return False
def insert_data(title, text, link, embedding):
    """Insert one article row into the ``vectorsvevijesti`` table.

    Args:
        title: Article title.
        text: Article body.
        link: URL of the article.
        embedding: Embedding vector stored in the ``embedding`` column.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    # try/finally so the connection is released even when the insert fails;
    # closing without commit discards the failed transaction.
    try:
        with conn.cursor() as cursor:
            cursor.execute('''
                INSERT INTO vectorsvevijesti (title, text, link, embedding)
                VALUES (%s, %s, %s, %s);
            ''', (title, text, link, embedding))
        conn.commit()
    finally:
        conn.close()
def get_data():
    """Return all stored articles as a list of ``(title, text, link)`` rows."""
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    # try/finally so the connection is released even when the query fails.
    try:
        with conn.cursor() as cursor:
            cursor.execute('''SELECT title,text,link FROM vectorsvevijesti;''')
            return cursor.fetchall()
    finally:
        conn.close()
def create_db():
    """Recreate the ``vectorsvevijesti`` table from scratch.

    Enables the ``vector`` extension if needed, drops the table when it
    already exists (all stored rows are lost) and creates it again with a
    1536-dimensional ``embedding`` column.
    """
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    # try/finally so the connection is released even when a statement fails.
    try:
        with conn.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
            # Must run after the extension exists so the vector type can be resolved.
            register_vector(conn)
            cursor.execute("DROP TABLE IF EXISTS vectorsvevijesti;")
            cursor.execute('''
                CREATE TABLE vectorsvevijesti (
                    id bigserial PRIMARY KEY,
                    title VARCHAR,
                    text VARCHAR,
                    link VARCHAR,
                    embedding vector(1536)
                );
            ''')
        conn.commit()
    finally:
        conn.close()
# NOTE(review): create_db() drops and recreates the table; if this call sits at module level it runs on every import and wipes stored rows — confirm that is intended.
create_db()

4
run.sh Normal file
View File

@@ -0,0 +1,4 @@
#!/bin/bash
# Restart the web server: free port 8080, then start the Go application.

# `exec` must not be used for the kill step: it replaces this shell with
# `kill`, so nothing after it would ever run.
pids=$(lsof -t -i:8080)
if [ -n "$pids" ]; then
    # Unquoted on purpose: lsof may print several PIDs, one per line.
    kill -9 $pids
fi

# Replace the shell with the server process.
exec go run cmd/web/web.go

0
server Executable file → Normal file
View File

0
spider Executable file → Normal file
View File

View File

@@ -1,16 +1,39 @@
{{define "articlesHTML"}}
<ol reversed>
{{range .articles}}
<li>
<div class="article_content">
<a href="/{{.ID}}/{{.Slug}}">
{{.Title}}</a></div>
<div class="timestamp">{{.SourceName}} - {{ .FormatedCreatedAt }}</div>
</li>
<br><br>
{{else}}
Nema članaka za izabrani datum.
{{end}}
</ol>
{{range .articles}}
<article class="news-article">
<div class="article_content">
<div class="ar-title">
<a href="/{{.ID}}/{{.Slug}}">
{{.Title}}
</a>
</div>
</div>
<a href="/{{.ID}}/{{.Slug}}">
<div class="prewi" data-content="{{.Content}}"></div>
</a>
<div class="timestamp"> starenovine - {{ .FormatedCreatedAt }} - {{.Category}}</div>
</article>
{{else}}
<div class="prewi">
Nema članaka za izabrani datum.
</div>
{{end}}
</div>
<script>
// Build the preview text: the first 200 characters of `content`, with '...'
// appended when the content is longer than that.
function createPewiev(content) {
    let slicedContent = content.slice(0, 200);
    if (content.length > 200) {
        slicedContent += '...';
    }
    return slicedContent;
}
// Select only elements that carry a data-content attribute. The "no articles"
// placeholder also has the .prewi class but no data-content: getAttribute()
// returned null for it, content.slice() threw a TypeError, and its static
// message would have been overwritten.
let previewDivs = document.querySelectorAll('.prewi[data-content]');
previewDivs.forEach(function (previewDiv) {
    previewDiv.textContent = createPewiev(previewDiv.getAttribute('data-content'));
});
</script>
{{end}}

View File

@@ -0,0 +1,39 @@
{{define "articlecategoryHTML"}}
<h3 class="category">{{.title}}</h3>
{{range .articles}}
<article class="news-article">
<div class="article_content">
<div class="ar-title">
<a href="/{{.ID}}/{{.Slug}}">
{{.Title}}
</a>
</div>
</div>
<a href="/{{.ID}}/{{.Slug}}">
<div class="prewi" data-content="{{.Content}}" data-title="{{.Title}}" data-link="/{{.ID}}/{{.Slug}}"></div>
</a>
<div class="timestamp"> starenovine - {{ .FormatedCreatedAt }} - {{.Category}}</div>
</article>
{{else}}
<div class="prewi">
Nema članaka za izabrani datum.
</div>
{{end}}
<script>
// Build the preview text: the first 200 characters of `content`, with '...'
// appended when the content is longer than that.
function createPewiev(content) {
    let slicedContent = content.slice(0, 200);
    if (content.length > 200) {
        slicedContent += '...';
    }
    return slicedContent;
}
// Select only elements that carry a data-content attribute. The "no articles"
// placeholder also has the .prewi class but no data-content: getAttribute()
// returned null for it, content.slice() threw a TypeError, and its static
// message would have been overwritten.
let previewDivs = document.querySelectorAll('.prewi[data-content]');
previewDivs.forEach(function (previewDiv) {
    previewDiv.textContent = createPewiev(previewDiv.getAttribute('data-content'));
});
</script>
{{end}}

View File

@@ -0,0 +1,52 @@
{{/* categorymenuHTML: navigation bar with a home link and one link per entry in .categories.
     On narrow screens the "Menu" button toggles the link list; on scroll the bar gets the "fixed" class. */}}
{{define "categorymenuHTML"}}
<nav class="hed">
    <div id="small-menu" onclick="handleSmallMenu();myFunction(this)">
        <div class="menu-icon">
            <div class="bar1"></div>
            <div class="bar2"></div>
            <div class="bar3"></div>
        </div>
        <p>Menu</p>
    </div>
    <div class="menu">
        <a href="/">
            <div class="home-icon" title="Pocetna">
                <div class="home-text">Pocetna</div>
                <i class="fa fa-home" style="font-size:48px;color:white"></i>
            </div>
        </a>
        {{range .categories}}
        <a href="/{{ . }}">
            <div class="home-icon" title="{{ . }}">
                {{ . }}
            </div>
        </a>
        {{end}}
    </div>
</nav>
{{/* A stray closing div tag that followed the nav element was removed: nothing in this template opens a matching div. */}}
<script>
    // Show or hide the link list (small screens).
    function handleSmallMenu (){
        let menu = document.querySelector('.menu')
        menu.classList.toggle('show-menu')
    }
    // Toggle the "change" class that animates the menu icon bars.
    function myFunction(x) {
        x.classList.toggle("change");
    }
    // Add the "fixed" class once the page is scrolled past the bottom of the bar.
    const handleScroll = function(event) {
        const top = window.scrollY;
        const header = document.querySelector('.hed');
        const headerBottom = header.offsetTop + header.offsetHeight;
        if (top >= headerBottom) {
            header.classList.add('fixed');
        } else {
            header.classList.remove('fixed');
        }
    }
    window.addEventListener('scroll', handleScroll);
</script>
{{end}}

View File

@@ -1,13 +1,11 @@
{{define "footerHTML"}}
<footer>
SN
<div>
<nav>
<a href="{{.previous}}">&lt;----</a> |
<a href="/">Početna</a> |
<a href="{{.next}}">----&gt;</a>
</nav>
</div>
<center>
<div class="fot">
<p>starenovine 2023-2024</p>
</div>
</center>
</footer>
{{end}}

23
web/data/fullweather.html Normal file
View File

@@ -0,0 +1,23 @@
{{/* fullweatherHTML: heading from .title followed by one card per entry in .weatherInfo.
     Each card shows the entry's name, the description of the first element of .Weather,
     and the Main, Wind and Clouds readings. Field names such as FellsLike and Preassure
     are spelled as they are referenced here; they must match the data passed in. */}}
{{define "fullweatherHTML"}}
<h2 class="w-title">{{.title}}</h2>
<div class="weather-container">
{{range .weatherInfo}}
<div class="weather-w">
<h3>{{.Name}}</h3>
{{with index .Weather 0}}
<div class="weather-info">Opis: {{.Description}}</div>
{{end}}
<div class="weather-info">Temperatura: {{.Main.Temp}}°C</div>
<div class="weather-info">Osecaj: {{.Main.FellsLike}}°C</div>
<div class="weather-info">Pritisak:{{.Main.Preassure}} hPa</div>
<div class="weather-info">Vlaznost: {{.Main.Humidity}}%</div>
<div class="weather-info">Min Temp: {{.Main.TempMin}}°C</div>
<div class="weather-info">Max Temp: {{.Main.TempMax}}°C</div>
<div class="weather-info">Vetar: {{.Wind.Speed}} m/s</div>
<div class="weather-info">Oblaci: {{.Clouds.All}}%</div>
</div>
{{end}}
</div>
{{end}}
{{end}}

View File

@@ -4,7 +4,7 @@
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=Edge">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="viewport" content="width=device-width,initial-scale=1.0">
<meta property="og:site_name" content="starenovine">
<meta name="twitter:card" content="preview">
<meta property="og:title" content="{{.title}}">
@@ -12,43 +12,468 @@
<meta property="og:url" content="https://www.starenovine.com">
<title>{{.title}} - stare novine</title>
<link rel="canonical" href="https://www.starenovine.com/">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
<style type="text/css">
body {
font-family: monospace;
font-size: 1.5em;
width: 90%;
max-width: 98vw;
}
:root {
--ease: cubic-bezier(.2, 1, .3, 1);
--icon-size: 60px;
--border-radius: 0.5;
--background: #2d2c3e;
--background-b: #2d2c3e;
--green: #16A085;
--white: #FFFFFF;
}
h1#title {
margin-block-end: 0;
font-size: 1.7em;
}
body {
font-size: 1.2em;
margin: 0 auto;
overflow-x: hidden;
background-color: black;
}
.category{
color: white;
width: 100%;
display: flex;
justify-content: center;
align-items: center;
}
.timestamp {
font-size: 0.8em;
color: gray;
}
h1#title {
background-color: #0B173B;
color: white;
margin-block-end: 0;
font-size: 1.7em;
padding-left: 5px;
}
.single_timestamp {
font-size: 0.77em;
margin-bottom: 0.7em;
color: gray;
}
.logo {
text-decoration: none;
color: black;
}
#logo {
font-size: 2vw;
background: white;
}
.timestamp {
display: flex;
justify-content: center;
align-items: center;
font-size: 0.8em;
color: gray;
}
pre.article_content {
background: white;
.prewi{
color: white;
padding-top: 5px;
padding-bottom: 2px;
margin-left: 5px;
}
.empty {
width: 89vw;
color: black;
box-shadow: 0 4px 8px #0000004d;
border: 1px solid black;
background: linear-gradient(90deg, rgba(231, 214, 197, 1) 0%, rgba(241, 234, 227, 1) 100%);
padding-top: 10px;
padding-bottom: 10px;
}
.ar-title {
background-color: #0B173B;
width: 100%;
color: black;
padding-top: 10px;
padding-bottom: 10px;
border-bottom: 3px solid black;
}
.w-title{
color:white;
}
.ar-title > a {
font-size: 1.2em;
font-weight: 500;
text-decoration: none;
color: white;
padding-left: 5px;
}
.article_content {
background-color: black;
gap: 4px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3);
}
.single_timestamp {
font-size: 0.77em;
margin-bottom: 0.7em;
color: gray;
padding-top: 10px;
padding-bottom: 10px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3);
}
#logo {
width: 100%;
font-size: 1.85vw;
background: linear-gradient(90deg, rgba(1, 1, 1, 1) 0%, rgba(11, 23, 59, 1) 50%, rgba(0, 0, 0, 1) 100%);
box-shadow: 2px 0 20px 6px rgba(11, 23, 59, 0.2);
color: white;
margin: 0 auto;
}
.list {
display: grid;
justify-content: center;
align-items: center;
list-style: none;
margin-top: 15px;
gap: 15px;
}
.news-article > a {
background: linear-gradient(90deg, rgba(231, 214, 197, 1) 0%, rgba(232, 232, 232, 1) 51%, rgba(231, 214, 197, 1) 100%);
color: black;
}
.news-article:hover .ar-title{
background-color: #FF0000;
}
.article_body {
color: white;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3);
padding-left: 5px;
}
.fot {
display: flex;
background-color: #0B173B;
text-decoration: none;
color: white;
border-radius: 5px;
display: flex;
justify-content: center;
align-items: center;
padding-top: 5px;
padding-bottom: 5px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3);
gap: 5%;
}
.fot > a {
color: white;
gap: 10px;
text-decoration: none;
}
.hed {
background-color: #0B173B;
color: white;
border-radius: 5px;
display: flex;
justify-content: center;
align-items: center;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3);
gap: 5%;
transition: background-color 0.5s var(--ease);
}
.hed.fixed {
position: fixed;
top: 0;
left: 0;
right: 0;
width: 100%;
z-index: 1000;
box-shadow: 0 4px 12px -4px rgba(255, 65, 54, 0.5);
}
.hed > a {
color: white;
gap: 10px;
text-decoration: none;
}
.article_content > a {
color: white;
text-decoration: none;
}
.news-article {
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3);
border: 1px solid;
border-image: linear-gradient(90deg, rgba(8, 8, 8, 1) 0%, rgba(179, 150, 121, 1) 21%, rgba(0, 0, 0, 1) 37%, rgba(98, 87, 75, 1) 63%, rgba(0, 0, 0, 1) 100%);
border-image-slice: 1;
}
pre.article_content {
background: white;
}
.news-article > a {
text-decoration: none;
color: black;
}
#weather {
display: flex;
width: 100%;
height: fit-content;
justify-content: space-between;
}
.weather-container {
display: flex;
justify-content: space-around;
margin-top: 50px;
color: white;
}
.weather-container > div {
text-size-adjust: auto;
}
.weather-w {
border: 1px solid #ccc;
width: 20%;
text-align: center;
color: white;
background-color: #0B173B;
}
.weather-w h3 {
margin-top: 0;
color: white;
}
.weather-info {
margin-top: 10px;
color: white;
}
.w-link{
text-decoration: none;
}
.weather-widget {
width: 19%;
height: 50px;
display: grid;
align-items: center;
justify-content: center;
color: white;
background-color: #0B173B;
border: 1px solid;
border-image: linear-gradient(90deg, rgba(8, 8, 8, 1) 0%, rgba(179, 150, 121, 1) 21%, rgba(0, 0, 0, 1) 37%, rgba(98, 87, 75, 1) 63%, rgba(0, 0, 0, 1) 100%);
border-image-slice: 1;
box-shadow: 0 4px 12px -4px rgba(255, 65, 54, 0.5);
}
.weather-widget > div {
font-size: 10;
}
.weather-widget > div > span {
font-size: 8;
}
#small-menu{
display: none;
}
.menu{
width: 100%;
text-decoration: none;
color: white;
display: flex;
justify-content: space-evenly;
align-items: center;
padding-top: 5px;
padding-bottom: 5px;
}
.menu > a {
color: white;
gap: 10px;
text-decoration: none;
}
.home-text{
display: none;
}
@media only screen and (max-width: 600px) {
#small-menu{
display: flex;
margin-left: 20px;
align-items: center;
gap: 5px;
}
.fa-home{
display: none;
}
.home-text{
display: block;
}
.menu{
display: none;
}
.menu.show-menu{
width: 100%;
text-decoration: none;
color: white;
display: grid;
transition: background-color 0.5s var(--ease);
margin-left: 50px;
justify-content: left;
margin-top: 5px;
margin-bottom: 5px;
}
.hed{
height: fit-content;
display: grid;
gap: 0 ;
justify-content: left;
}
.menu.show-menu > a{
text-decoration: none;
border-image: linear-gradient(90deg, rgba(8, 8, 8, 1) 0%, rgba(179, 150, 121, 1) 21%, rgba(0, 0, 0, 1) 37%, rgba(98, 87, 75, 1) 63%, rgba(0, 0, 0, 1) 100%);
}
.home-icon {
display: flex;
justify-content: center;
align-items: center;
width: 10px;
height: 10px;
background-color: #0B173B;
border-radius: 50%;
cursor: pointer;
transition: background-color 0.5s var(--ease);
}
#weather {
display: grid;
margin-bottom: 4%;
}
.weather-widget {
width: 90vw;
height: 20px;
display: flex;
justify-content: space-between;
}
.weather-container {
display: grid;
color: white;
gap: 2%;
}
.weather-w{
width: 100vw;
}
}
html {
margin: 0 auto;
max-width: 98vw;
overflow-x: hidden;
margin: 0 auto;
width: 90%;
overflow-x: hidden;
}
.arr-pr-nx {
display: flex;
justify-content: space-between;
align-items: center;
}
.arr-pr-nx svg {
width: 40px;
height: 40px;
margin: 0 1rem;
cursor: pointer;
overflow: visible;
fill: white;
}
.arr-pr-nx svg polygon,
.arr-pr-nx path {
transition: all 0.5s var(--ease);
}
.arr-pr-nx svg:hover polygon,
.arr-pr-nx svg:hover path {
transition: all 1s var(--ease);
fill: #FF0000;
}
.arr-pr-nx svg:hover .arrow {
animation: arrow-anim 2.5s var(--ease) infinite;
}
.arr-pr-nx svg:hover .arrow-fixed {
animation: arrow-fixed-anim 2.5s var(--ease) infinite;
}
@keyframes arrow-anim {
0% {
opacity: 1;
transform: translateX(0);
}
5% {
transform: translateX(-0.1rem);
}
100% {
transform: translateX(1rem);
opacity: 0;
}
}
@keyframes arrow-fixed-anim {
5% {
opacity: 0;
}
20% {
opacity: 0.4;
}
100% {
opacity: 1;
}
}
.home-icon {
display: flex;
justify-content: center;
align-items: center;
width: 40px;
height: 40px;
background-color: #0B173B;
border-radius: 50%;
cursor: pointer;
transition: background-color 0.5s var(--ease);
}
.home-icon:hover {
background-color: #FF0000;
}
.home-icon i {
font-size: 2rem;
color: white;
transition: color 0.5s var(--ease);
}
.home-icon:hover i {
color: #FF0000;
}
.menu-icon {
display: inline-block;
cursor: pointer;
}
.bar1, .bar2, .bar3 {
width: 35px;
height: 5px;
background-color: white;
margin: 6px 0;
transition: 0.4s;
}
.change .bar1 {
transform: translate(0, 11px) rotate(-45deg);
}
.change .bar2 {opacity: 0;}
.change .bar3 {
transform: translate(0, -11px) rotate(45deg);
}
</style>

View File

@@ -1,5 +1,7 @@
{{define "headerHTML"}}
<header>
<center>
<a href="/" class="logo">
<pre id="logo">
_____ ______ ____ ____ ___ ____ ___ __ __ ____ ____ ___
/ ___/| | / || \ / _]| \ / \ | | || || \ / _]
@@ -10,11 +12,10 @@
\___| |__| |__|__||__|\_||_____||__|__| \___/ \_/ |____||__|__||_____|
</pre>
<br>
<nav>
<a href="{{.previous}}">&lt;----</a> |
<a href="/">Početna</a> |
<a href="{{.next}}">----&gt;</a>
</nav>
</a>
</center>
{{template "categorymenuHTML" .}}
</header>
{{end}}

27
web/data/prevnext.html Normal file
View File

@@ -0,0 +1,27 @@
{{/* prevnextHTML: previous/next navigation. Renders two arrow links pointing at .previous and .next;
     the "previous" arrow is the same SVG shape as "next", mirrored horizontally by its group transform. */}}
{{define "prevnextHTML"}}
<nav class="fot">
<a href="{{.previous}}">
<div class="arr-pr-nx" title="Prethodna">
<svg width="18px" height="17px" viewBox="0 0 18 17" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="prev" transform="translate(8.500000, 8.500000) scale(-1, 1) translate(-8.500000, -8.500000)">
<polygon class="arrow" points="16.3746667 8.33860465 7.76133333 15.3067621 6.904 14.3175671 14.2906667 8.34246869 6.908 2.42790698 7.76 1.43613596"></polygon>
<polygon class="arrow-fixed" points="16.3746667 8.33860465 7.76133333 15.3067621 6.904 14.3175671 14.2906667 8.34246869 6.908 2.42790698 7.76 1.43613596"></polygon>
<path d="M-1.48029737e-15,0.56157424 L-1.48029737e-15,16.1929159 L9.708,8.33860465 L-2.66453526e-15,0.56157424 L-1.48029737e-15,0.56157424 Z M1.33333333,3.30246869 L7.62533333,8.34246869 L1.33333333,13.4327013 L1.33333333,3.30246869 L1.33333333,3.30246869 Z"></path>
</g>
</svg>
</div>
</a>
<a href="{{.next}}">
<div class="arr-pr-nx" title="Sledeca">
<svg width="18px" height="17px" viewBox="-1 0 18 17" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g>
<polygon class="arrow" points="16.3746667 8.33860465 7.76133333 15.3067621 6.904 14.3175671 14.2906667 8.34246869 6.908 2.42790698 7.76 1.43613596"></polygon>
<polygon class="arrow-fixed" points="16.3746667 8.33860465 7.76133333 15.3067621 6.904 14.3175671 14.2906667 8.34246869 6.908 2.42790698 7.76 1.43613596"></polygon>
<path d="M-4.58892184e-16,0.56157424 L-4.58892184e-16,16.1929159 L9.708,8.33860465 L-1.64313008e-15,0.56157424 L-4.58892184e-16,0.56157424 Z M1.33333333,3.30246869 L7.62533333,8.34246869 L1.33333333,13.4327013 L1.33333333,3.30246869 L1.33333333,3.30246869 Z"></path>
</g>
</svg>
</div>
</a>
</nav>
{{end}}

View File

@@ -2,7 +2,7 @@
{{with .article }}
<div class="article_content">
<h1 id="title">{{.Title}}</h1>
<div class="single_timestamp">{{.SourceName}} - {{ .FormatedCreatedAt }}</div>
<div class="single_timestamp"> starenovine - {{ .FormatedCreatedAt }}</div>
<div class="article_body">
{{.Content}}
</div>

View File

@@ -0,0 +1,14 @@
{{/* weatherwidgetHTML: compact weather strip linking to /weather. Renders one tile (name and temperature)
     per entry in .weatherInfo. NOTE(review): the ids "city" and "temperature" sit inside the range, so they
     repeat once per tile whenever more than one entry is rendered. */}}
{{define "weatherwidgetHTML"}}
<br>
<a class="w-link" href="/weather">
<div id="weather">
{{range .weatherInfo}}
<div class="weather-widget">
<div><span id="city">{{.Name}}</span></div>
<div id="temperature">{{.Main.Temp}} °C</div>
</div>
{{end}}
</div>
</a>
<br>
{{end}}

View File

@@ -6,6 +6,8 @@
{{template "singleArticleHTML" .}}
<br>
{{template "prevnextHTML" .}}
{{template "footerHTML" .}}
</body>
</html>

12
web/tpl/category.html Normal file
View File

@@ -0,0 +1,12 @@
{{/* categoryHTML: full page for a category listing. Composes the head, header, category article list
     and footer templates, passing the page data (.) to each of them. */}}
{{define "categoryHTML"}}
{{template "headHTML" .}}
<body>
{{template "headerHTML" .}}
{{template "articlecategoryHTML" .}}
<br>
{{template "footerHTML" .}}
</body>
</html>
{{end}}

View File

@@ -4,8 +4,11 @@
<body>
{{template "headerHTML" .}}
{{template "articlesHTML" .}}
{{/* The page data must be passed explicitly: without the trailing dot the widget receives nil and cannot read .weatherInfo. */}}
{{template "weatherwidgetHTML" .}}
{{template "articlesHTML" .}}
<br>
{{template "prevnextHTML" .}}
{{template "footerHTML" .}}
</body>
</html>

View File

@@ -1,11 +1,13 @@
{{define "homeHTML"}}
{{template "headHTML" .}}
<body>
{{template "headerHTML" .}}
{{template "articlesHTML" .}}
{{template "weatherwidgetHTML" .}}
{{template "articlesHTML" .}}
<br>
{{template "prevnextHTML" .}}
{{template "footerHTML" .}}
</body>
</html>

12
web/tpl/weather.html Normal file
View File

@@ -0,0 +1,12 @@
{{/* weatherHTML: full weather page. Composes the head, header, full weather view and footer templates,
     passing the page data (.) to each of them. */}}
{{define "weatherHTML"}}
{{template "headHTML" .}}
<body>
{{template "headerHTML" .}}
{{template "fullweatherHTML" .}}
{{template "footerHTML" .}}
</body>
</html>
{{end}}