This commit is contained in:
2023-12-18 16:51:47 +01:00
commit 88741b2303
36 changed files with 1490 additions and 0 deletions

View File

@@ -0,0 +1,188 @@
package database
import (
"fmt"
_ "github.com/lib/pq"
"gitlab.com/kbr4/svevijesti/internal/model"
"html/template"
"math"
"strings"
"time"
)
// InsertArticle stores one scraped article as a new row in the articles table.
//
// The title, content, slug, original URL and source id of article are passed
// as query parameters. The error reported by the database, if any, is
// returned unchanged; nil means the row was inserted.
func InsertArticle(store *Store, article model.ScrapedArticle) (err error) {
	const insertSQL = `
INSERT INTO articles
(title, content, slug, original_url, source_id)
VALUES
($1,$2,$3,$4,$5);`

	_, err = store.Exec(
		insertSQL,
		article.Title,
		article.Content,
		article.Slug,
		article.OriginalUrl,
		article.SourceId,
	)
	return err
}
// IsSaved reports whether an article with the given original URL is already
// present in the articles table.
//
// The signature has no error result, so a failing query still panics, as it
// always did; the panic value is an error that wraps the database error and
// names the URL that was being checked.
func IsSaved(store *Store, url string) bool {
	const existsSQL = `
select exists(select 1 from articles where original_url = $1);
`
	// QueryRow runs the statement directly; preparing a statement that is
	// executed once and closed right away only adds a round trip.
	var exists bool
	if err := store.QueryRow(existsSQL, url).Scan(&exists); err != nil {
		panic(fmt.Errorf("checking whether %q is saved: %w", url, err))
	}
	return exists
}
// ArticlesForDay returns the articles created on the calendar day of day,
// newest id first, skipping rows whose content is 10 characters or shorter.
//
// The day is the half-open interval [day 00:00, next day 00:00), so an
// article stamped exactly at midnight belongs to the day that starts then.
// Each article gets its FormatedCreatedAt label (minutes ago, hours ago, or
// an absolute timestamp once it is older than 24 hours) and its SourceName.
//
// The returned slice is never nil. On error it holds the articles read so
// far, together with the error.
//
// NOTE(review): content is scanned straight into a template.HTML field, so
// html/template will not escape it when rendering — confirm the stored
// content is trusted or escape it where it is displayed.
func ArticlesForDay(store *Store, day time.Time) (articles []model.DisplayArticle, err error) {
	const daySQL = `
select id,title, content, slug, original_url, source_id, created_at from articles where created_at >= $1 and created_at < $2 and LENGTH(content) > 10 order by id desc;
`
	result := []model.DisplayArticle{}

	dayStart := day.Format("2006-01-02")
	dayEnd := day.AddDate(0, 0, 1).Format("2006-01-02")

	rows, err := store.Query(daySQL, dayStart, dayEnd)
	if err != nil {
		return result, fmt.Errorf("querying articles for %s: %w", dayStart, err)
	}
	defer rows.Close()

	for rows.Next() {
		r := model.DisplayArticle{}
		err = rows.Scan(&r.ID, &r.Title, &r.Content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt)
		if err != nil {
			return result, fmt.Errorf("reading article row for %s: %w", dayStart, err)
		}

		ago := time.Since(r.CreatedAt)
		hours := ago.Hours()
		switch {
		case hours < 1:
			r.FormatedCreatedAt = fmt.Sprintf("Prije %d minuta.", int(math.Floor(ago.Minutes())))
		case hours > 24:
			r.FormatedCreatedAt = r.CreatedAt.Format("02.01.2006. 15:04:05")
		default:
			r.FormatedCreatedAt = fmt.Sprintf("Prije %d sati.", int(math.Floor(hours)))
		}
		r.SourceName = model.SourceName(r.SourceId)
		result = append(result, r)
	}
	// Next also returns false when iteration fails part-way; without this
	// check a truncated result would be reported as a complete one.
	if err = rows.Err(); err != nil {
		return result, fmt.Errorf("iterating articles for %s: %w", dayStart, err)
	}
	return result, nil
}
// ArticleByID loads the article whose id and slug both match and prepares it
// for display.
//
// The stored content is plain text taken from third-party pages, so it is
// HTML-escaped first; only then are line breaks turned into <br> tags and the
// result marked as template.HTML. FormatedCreatedAt and SourceName are filled
// in as well.
//
// When no row matches, the error from Scan (sql.ErrNoRows) is returned
// unchanged together with a zero DisplayArticle.
func ArticleByID(store *Store, ID int, slug string) (article model.DisplayArticle, err error) {
	const articleSQL = `
select id,title, content, slug, original_url, source_id, created_at from articles where id = $1 and slug = $2;
`
	var (
		found   model.DisplayArticle
		content string
	)
	// QueryRow defers every error to Scan, so there is a single check here.
	err = store.QueryRow(articleSQL, ID, slug).Scan(
		&found.ID, &found.Title, &content, &found.Slug,
		&found.OriginalUrl, &found.SourceId, &found.CreatedAt,
	)
	if err != nil {
		return model.DisplayArticle{}, err
	}

	// Escape before adding markup: anything that looks like a tag in the
	// scraped text must be shown as text, not interpreted by the browser.
	escaped := template.HTMLEscapeString(content)
	found.Content = template.HTML(strings.Replace(escaped, "\n", "<br>\n", -1))

	ago := time.Since(found.CreatedAt)
	hours := ago.Hours()
	switch {
	case hours < 1:
		found.FormatedCreatedAt = fmt.Sprintf("Prije %d minuta.", int(math.Floor(ago.Minutes())))
	case hours > 24:
		found.FormatedCreatedAt = found.CreatedAt.Format("02.01.2006. 15:04:05")
	default:
		found.FormatedCreatedAt = fmt.Sprintf("Prije %d sati.", int(math.Floor(hours)))
	}
	found.SourceName = model.SourceName(found.SourceId)

	return found, nil
}
// PreviousAndNextArticleUrlByID returns the site-relative URLs ("/<id>/<slug>")
// of the articles adjacent to ID: the closest larger id as nextUrl and the
// closest smaller id as previousUrl.
//
// A neighbour that does not exist is reported as "#" and is not an error, so
// the first and the last article still get their one valid link. The next
// article is found regardless of how large the gap in ids is. err is non-nil
// only when a query itself fails.
func PreviousAndNextArticleUrlByID(store *Store, ID int) (nextUrl string, previousUrl string, err error) {
	const (
		previousSQL = `
select id, slug from articles where id < $1 and id > 0 order by id desc limit 1;
`
		nextSQL = `
select id, slug from articles where id > $1 order by id asc limit 1;
`
	)

	if previousUrl, err = adjacentArticleURL(store, previousSQL, ID); err != nil {
		return "#", "#", fmt.Errorf("looking up article before %d: %w", ID, err)
	}
	if nextUrl, err = adjacentArticleURL(store, nextSQL, ID); err != nil {
		return "#", previousUrl, fmt.Errorf("looking up article after %d: %w", ID, err)
	}
	return nextUrl, previousUrl, nil
}

// adjacentArticleURL runs query (which must select id and slug and take the
// reference id as $1) and formats the first row as "/<id>/<slug>". It returns
// "#" when the query yields no row.
func adjacentArticleURL(store *Store, query string, id int) (string, error) {
	rows, err := store.Query(query, id)
	if err != nil {
		return "#", err
	}
	defer rows.Close()

	if !rows.Next() {
		// Either there is no neighbour (Err is nil) or iteration failed.
		return "#", rows.Err()
	}

	var (
		neighbourID   int
		neighbourSlug string
	)
	if err := rows.Scan(&neighbourID, &neighbourSlug); err != nil {
		return "#", err
	}
	return fmt.Sprintf("/%d/%s", neighbourID, neighbourSlug), nil
}

View File

@@ -0,0 +1,25 @@
package database
import (
	"database/sql"
	"fmt"
	"os"

	_ "github.com/lib/pq"
)
// Connection settings for the PostgreSQL database; Connect formats them into
// the connection string.
//
// NOTE(review): the database password is committed here in plain text. It
// looks like it should be rotated and supplied from the environment or a
// secrets store instead — confirm with the owners.
const (
	host     = "localhost"
	port     = 5432
	user     = "svevijesti"
	password = "salmonela pljusti 221 hamo"
	dbname   = "svevijestiweb"
)
// Store is an alias for sql.DB; it is the database handle type taken by the
// query functions of this package.
type Store = sql.DB
// Connect returns a database handle for the PostgreSQL database.
//
// When the DATABASE_URL environment variable is set to a non-empty value it
// is used as the connection string, which allows credentials to be supplied
// without editing the source. Otherwise the connection string is built from
// the package constants, exactly as before.
//
// sql.Open only validates its arguments; it may not open a connection, so a
// nil error here does not prove that the database is reachable.
func Connect() (*Store, error) {
	psqlInfo := os.Getenv("DATABASE_URL")
	if psqlInfo == "" {
		psqlInfo = fmt.Sprintf("host=%s port=%d user=%s "+
			"password='%s' dbname=%s sslmode=disable",
			host, port, user, password, dbname)
	}
	return sql.Open("postgres", psqlInfo)
}

51
internal/model/model.go Normal file
View File

@@ -0,0 +1,51 @@
package model
import (
"html/template"
"time"
)
// ScrapedArticle is the article data produced by a scraper; its fields are
// the values that database.InsertArticle writes to the articles table.
type ScrapedArticle struct {
	Title       string // headline text taken from the page, or Terminator for the end-of-crawl marker
	Content     string // article text extracted from the page, with surrounding whitespace trimmed
	Slug        string // URL slug the scraper derives from the source id, a random number and the title
	OriginalUrl string // URL of the page the article was scraped from
	SourceId    int    // one of the *Source constants identifying the news site
}
// DisplayArticle is an article as loaded from the database and prepared for
// rendering in a template.
type DisplayArticle struct {
	ID                int
	Title             string
	Content           template.HTML // rendered by html/template without escaping
	Slug              string
	OriginalUrl       string
	SourceId          int       // one of the *Source constants
	CreatedAt         time.Time // value of the created_at column
	FormatedCreatedAt string    // display label derived from CreatedAt (relative age or absolute timestamp)
	SourceName        string    // result of SourceName(SourceId)
}
// Identifiers of the supported news sources, as stored in the source_id
// column of an article.
const (
	KlixSource       = 1
	SrpskainfoSource = 2
	BljesakSource    = 3
	AvazSource       = 4
)

// SourceName maps a source identifier to the short name of the news site.
// Identifiers that are not one of the *Source constants yield "starenovine".
func SourceName(sourceId int) string {
	name := "starenovine"
	switch sourceId {
	case KlixSource:
		name = "klix"
	case SrpskainfoSource:
		name = "srpskainfo"
	case BljesakSource:
		name = "bljesak"
	case AvazSource:
		name = "avaz"
	}
	return name
}
// Terminator is the sentinel value the scrapers send on their channels (as a
// plain string, or as the Title of a ScrapedArticle) once a homepage crawl has
// finished or failed.
const (
	Terminator = "TERMINATED"
)

104
internal/scraper/avaz.go Normal file
View File

@@ -0,0 +1,104 @@
package scraper
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gosimple/slug"
"gitlab.com/kbr4/svevijesti/internal/model"
"math/rand"
"regexp"
"strings"
"time"
)
// Channels of the avaz.ba crawl.
//
// AvazArticles carries every article scraped from an article page and, once
// the homepage crawl has finished or failed, an article whose Title is
// model.Terminator.
// AvazCandidates carries the homepage links that look like article URLs, and
// finally model.Terminator.
// AvazApprovedSites (buffered, capacity 2) carries the URLs that
// visitAvazApprovedPages hands to the article collector; it also receives
// model.Terminator. The code that feeds it with real URLs is not in this file.
var AvazArticles = make(chan model.ScrapedArticle)
var AvazCandidates = make(chan string)
var AvazApprovedSites = make(chan string, 2)
// CrawlAvaz starts one crawl of avaz.ba and returns without waiting for it.
//
// Two collectors restricted to avaz.ba are created: one for the homepage and
// one for article pages. The homepage visit and the consumer of
// AvazApprovedSites both run in their own goroutines.
func CrawlAvaz() {
	newCollector := func() *colly.Collector {
		return colly.NewCollector(colly.AllowedDomains("avaz.ba"))
	}
	homeCollector := newCollector()
	articleCollector := newCollector()

	setupAvazArticlePageCrawler(articleCollector)
	setupAvazHomepageCrawler(homeCollector, articleCollector)

	go visitAvazApprovedPages(articleCollector)
}
// visitAvazApprovedPages visits every URL received on AvazApprovedSites with
// the article collector. It runs until the channel is closed.
//
// The homepage crawler also sends model.Terminator on this channel. That
// value is a marker, not a URL, so it is skipped instead of being passed to
// Visit. The loop deliberately keeps running after the marker, so URLs that
// arrive later are still consumed.
func visitAvazApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for url := range AvazApprovedSites {
		if url == model.Terminator {
			continue
		}
		fmt.Println("Visiting: ", url)
		if err := crArticlePage.Visit(url); err != nil {
			fmt.Println("Problem visiting: ", url, err)
		}
	}
}
// setupAvazHomepageCrawler registers the homepage callbacks on crHomePage and
// starts visiting https://avaz.ba in a new goroutine.
//
// Every link whose href matches the article pattern and is not blacklisted is
// sent on AvazCandidates. When the homepage has been scraped, or fetching it
// failed, the terminator is sent on AvazArticles, AvazApprovedSites and
// AvazCandidates, in that order, after a 5 second pause.
//
// crArticlePage is accepted for signature compatibility and is not used.
func setupAvazHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	// The patterns are constants, so a compile failure is a programming
	// error; MustCompile reports it here instead of leaving a nil regexp
	// behind for the callback to trip over.
	articleURL := regexp.MustCompile(`/\d\d+/([a-z0-9-]+)`)
	blacklisted := regexp.MustCompile(`(english)`)

	crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if articleURL.MatchString(href) && !blacklisted.MatchString(href) {
			AvazCandidates <- href
		}
	})

	// signalEnd tells the consumers of all three channels that this crawl
	// has ended.
	signalEnd := func() {
		time.Sleep(5 * time.Second)
		AvazArticles <- model.ScrapedArticle{Title: model.Terminator}
		AvazApprovedSites <- model.Terminator
		AvazCandidates <- model.Terminator
	}
	crHomePage.OnScraped(func(_ *colly.Response) { signalEnd() })
	crHomePage.OnError(func(_ *colly.Response, _ error) { signalEnd() })

	go func() {
		if err := crHomePage.Visit("https://avaz.ba"); err != nil {
			fmt.Println("Problem visiting: ", "https://avaz.ba", err)
		}
	}()
}
// setupAvazArticlePageCrawler registers the article-page callbacks on
// crArticlePage.
//
// For every fetched page the first "h1.title, h3.title" element supplies the
// title, and the text of all "p.podtitle, div.artikal-text" elements,
// concatenated and trimmed, supplies the content. The resulting article is
// sent on AvazArticles. Crawl errors are printed.
func setupAvazArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnError(func(_ *colly.Response, err error) {
		fmt.Println("Problem crawling!", err)
	})

	crArticlePage.OnHTML("html", func(page *colly.HTMLElement) {
		var headline string
		page.ForEachWithBreak("h1.title, h3.title", func(_ int, heading *colly.HTMLElement) bool {
			headline = heading.Text
			return false // the first match is enough
		})

		var body strings.Builder
		page.ForEach("p.podtitle, div.artikal-text", func(_ int, part *colly.HTMLElement) {
			body.WriteString(extractJustText(part.DOM))
		})

		// The random number makes slugs of equally titled articles differ.
		slugSeed := fmt.Sprintf("%d %d %s", model.AvazSource, rand.Intn(1000), headline)

		AvazArticles <- model.ScrapedArticle{
			Title:       headline,
			Content:     strings.TrimSpace(body.String()),
			Slug:        slug.Make(slugSeed),
			OriginalUrl: page.Request.URL.String(),
			SourceId:    model.AvazSource,
		}
	})
}

View File

@@ -0,0 +1,104 @@
package scraper
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gosimple/slug"
"gitlab.com/kbr4/svevijesti/internal/model"
"math/rand"
"regexp"
"strings"
"time"
)
// Channels of the bljesak.info crawl.
//
// BljesakArticles carries every article scraped from an article page and,
// once the homepage crawl has finished or failed, an article whose Title is
// model.Terminator.
// BljesakCandidates carries the homepage links that look like article URLs,
// and finally model.Terminator.
// BljesakApprovedSites (buffered, capacity 2) carries the URLs that
// visitBljesakApprovedPages hands to the article collector; it also receives
// model.Terminator. The code that feeds it with real URLs is not in this file.
var BljesakArticles = make(chan model.ScrapedArticle)
var BljesakCandidates = make(chan string)
var BljesakApprovedSites = make(chan string, 2)
// CrawlBljesak starts one crawl of bljesak.info and returns without waiting
// for it.
//
// Two collectors restricted to bljesak.info are created: one for the homepage
// and one for article pages. The homepage visit and the consumer of
// BljesakApprovedSites both run in their own goroutines.
func CrawlBljesak() {
	newCollector := func() *colly.Collector {
		return colly.NewCollector(colly.AllowedDomains("bljesak.info"))
	}
	homeCollector := newCollector()
	articleCollector := newCollector()

	setupBljesakArticlePageCrawler(articleCollector)
	setupBljesakHomepageCrawler(homeCollector, articleCollector)

	go visitBljesakApprovedPages(articleCollector)
}
// visitBljesakApprovedPages visits every URL received on BljesakApprovedSites
// with the article collector. It runs until the channel is closed.
//
// The homepage crawler also sends model.Terminator on this channel. That
// value is a marker, not a URL, so it is skipped instead of being passed to
// Visit. The loop deliberately keeps running after the marker, so URLs that
// arrive later are still consumed.
func visitBljesakApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for url := range BljesakApprovedSites {
		if url == model.Terminator {
			continue
		}
		fmt.Println("Visiting: ", url)
		if err := crArticlePage.Visit(url); err != nil {
			fmt.Println("Problem visiting: ", url, err)
		}
	}
}
// setupBljesakHomepageCrawler registers the homepage callbacks on crHomePage
// and starts visiting https://bljesak.info in a new goroutine.
//
// Every link whose href ends in two or more digits and is not blacklisted is
// sent on BljesakCandidates. When the homepage has been scraped, or fetching
// it failed, the terminator is sent on BljesakArticles, BljesakApprovedSites
// and BljesakCandidates, in that order, after a 5 second pause.
//
// crArticlePage is accepted for signature compatibility and is not used.
func setupBljesakHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	// The patterns are constants, so a compile failure is a programming
	// error; MustCompile reports it here instead of leaving a nil regexp
	// behind for the callback to trip over.
	articleURL := regexp.MustCompile(`\d\d+$`)
	blacklisted := regexp.MustCompile(`(info-vodic|foto-data)`)

	crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if articleURL.MatchString(href) && !blacklisted.MatchString(href) {
			BljesakCandidates <- href
		}
	})

	// signalEnd tells the consumers of all three channels that this crawl
	// has ended.
	signalEnd := func() {
		time.Sleep(5 * time.Second)
		BljesakArticles <- model.ScrapedArticle{Title: model.Terminator}
		BljesakApprovedSites <- model.Terminator
		BljesakCandidates <- model.Terminator
	}
	crHomePage.OnScraped(func(_ *colly.Response) { signalEnd() })
	crHomePage.OnError(func(_ *colly.Response, _ error) { signalEnd() })

	go func() {
		if err := crHomePage.Visit("https://bljesak.info"); err != nil {
			fmt.Println("Problem visiting: ", "https://bljesak.info", err)
		}
	}()
}
// setupBljesakArticlePageCrawler registers the article-page callbacks on
// crArticlePage.
//
// For every fetched page the first "h1.title, h3.title" element supplies the
// title, and the text of all "div.intro, div#infiniteLoadBreakpoint"
// elements, concatenated and trimmed, supplies the content. The resulting
// article is sent on BljesakArticles. Crawl errors are printed.
func setupBljesakArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnError(func(_ *colly.Response, err error) {
		fmt.Println("Problem crawling!", err)
	})

	crArticlePage.OnHTML("html", func(page *colly.HTMLElement) {
		var headline string
		page.ForEachWithBreak("h1.title, h3.title", func(_ int, heading *colly.HTMLElement) bool {
			headline = heading.Text
			return false // the first match is enough
		})

		var body strings.Builder
		page.ForEach("div.intro, div#infiniteLoadBreakpoint", func(_ int, part *colly.HTMLElement) {
			body.WriteString(extractJustText(part.DOM))
		})

		// The random number makes slugs of equally titled articles differ.
		slugSeed := fmt.Sprintf("%d %d %s", model.BljesakSource, rand.Intn(1000), headline)

		BljesakArticles <- model.ScrapedArticle{
			Title:       headline,
			Content:     strings.TrimSpace(body.String()),
			Slug:        slug.Make(slugSeed),
			OriginalUrl: page.Request.URL.String(),
			SourceId:    model.BljesakSource,
		}
	})
}

103
internal/scraper/klix.go Normal file
View File

@@ -0,0 +1,103 @@
package scraper
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gosimple/slug"
"gitlab.com/kbr4/svevijesti/internal/model"
"math/rand"
"regexp"
"strings"
"time"
)
// Channels of the www.klix.ba crawl.
//
// KlixArticles carries every article scraped from an article page and, once
// the homepage crawl has finished or failed, an article whose Title is
// model.Terminator.
// KlixCandidates carries the homepage links that look like article URLs, and
// finally model.Terminator.
// KlixApprovedSites (buffered, capacity 2) carries the URLs that
// visitApprovedPages hands to the article collector; it also receives
// model.Terminator. The code that feeds it with real URLs is not in this file.
var KlixArticles = make(chan model.ScrapedArticle)
var KlixCandidates = make(chan string)
var KlixApprovedSites = make(chan string, 2)
// CrawlKlix starts one crawl of www.klix.ba and returns without waiting for
// it.
//
// Two collectors restricted to www.klix.ba are created: one for the homepage
// and one for article pages. The homepage visit and the consumer of
// KlixApprovedSites both run in their own goroutines.
func CrawlKlix() {
	newCollector := func() *colly.Collector {
		return colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
	}
	homeCollector := newCollector()
	articleCollector := newCollector()

	setupKlArticlePageCrawler(articleCollector)
	setupKlHomepageCrawler(homeCollector, articleCollector)

	go visitApprovedPages(articleCollector)
}
// visitApprovedPages visits every URL received on KlixApprovedSites with the
// article collector. It runs until the channel is closed.
//
// The homepage crawler also sends model.Terminator on this channel. That
// value is a marker, not a URL, so it is skipped instead of being passed to
// Visit. The loop deliberately keeps running after the marker, so URLs that
// arrive later are still consumed.
func visitApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for url := range KlixApprovedSites {
		if url == model.Terminator {
			continue
		}
		fmt.Println("Visiting: ", url)
		if err := crArticlePage.Visit(url); err != nil {
			fmt.Println("Problem visiting: ", url, err)
		}
	}
}
// setupKlHomepageCrawler registers the homepage callbacks on crHomePage and
// starts visiting https://www.klix.ba in a new goroutine.
//
// Every link inside ".container" whose href ends in two or more digits is
// sent on KlixCandidates as an absolute URL: the site origin is prepended
// unless the href already starts with "http://" or "https://". When the
// homepage has been scraped, or fetching it failed, the terminator is sent on
// KlixArticles, KlixApprovedSites and KlixCandidates, in that order, after a
// 5 second pause.
//
// crArticlePage is accepted for signature compatibility and is not used.
func setupKlHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	// The pattern is a constant, so a compile failure is a programming
	// error; MustCompile reports it here instead of leaving a nil regexp
	// behind for the callback to trip over.
	articleURL := regexp.MustCompile(`\d\d+$`)

	crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if !articleURL.MatchString(href) {
			return
		}
		completeUrl := "https://www.klix.ba" + href
		if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
			// Already absolute; prefixing it would produce a broken URL.
			completeUrl = href
		}
		KlixCandidates <- completeUrl
	})

	// signalEnd tells the consumers of all three channels that this crawl
	// has ended.
	signalEnd := func() {
		time.Sleep(5 * time.Second)
		KlixArticles <- model.ScrapedArticle{Title: model.Terminator}
		KlixApprovedSites <- model.Terminator
		KlixCandidates <- model.Terminator
	}
	crHomePage.OnScraped(func(_ *colly.Response) { signalEnd() })
	crHomePage.OnError(func(_ *colly.Response, _ error) { signalEnd() })

	go func() {
		if err := crHomePage.Visit("https://www.klix.ba"); err != nil {
			fmt.Println("Problem visiting: ", "https://www.klix.ba", err)
		}
	}()
}
// setupKlArticlePageCrawler registers the article-page callbacks on
// crArticlePage.
//
// For every fetched page the first "title" element supplies the title, and
// the text of all "div#text, p.lead" elements, concatenated and trimmed,
// supplies the content. The resulting article is sent on KlixArticles.
// Crawl errors are printed together with their cause, as the avaz and
// bljesak crawlers do.
func setupKlArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
		title := ""
		e.ForEachWithBreak("title", func(_ int, el *colly.HTMLElement) bool {
			title = el.Text
			return false // the first match is enough
		})

		var text strings.Builder
		e.ForEach("div#text, p.lead", func(_ int, el *colly.HTMLElement) {
			text.WriteString(extractJustText(el.DOM))
		})

		article := model.ScrapedArticle{
			Title:       title,
			Content:     strings.TrimSpace(text.String()),
			OriginalUrl: e.Request.URL.String(),
			SourceId:    model.KlixSource,
		}
		// The random number makes slugs of equally titled articles differ.
		slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
		article.Slug = slug.Make(slugBase)

		KlixArticles <- article
	})

	crArticlePage.OnError(func(_ *colly.Response, err error) {
		fmt.Println("Problem crawling!", err)
	})
}

View File

@@ -0,0 +1,26 @@
package scraper
import (
"github.com/PuerkitoBio/goquery"
)
// extractJustText returns the text found in el, one line per leaf element.
//
// An element without child elements contributes its text followed by a
// newline. An element with child elements contributes only what its div, p,
// span and a children yield recursively; its own direct text and any other
// child tags are skipped. An empty selection yields "".
//
// Whether an element is a leaf is decided by counting its child elements.
// Comparing Text() with Html() is not reliable for this: Html() escapes
// characters such as & < > ' and ", so any leaf containing one of them would
// not be recognised and its text would be lost.
func extractJustText(el *goquery.Selection) string {
	if len(el.Nodes) == 0 {
		return ""
	}

	children := el.Children()
	if children.Length() == 0 {
		return el.Text() + "\n"
	}

	textPart := ""
	children.Each(func(_ int, child *goquery.Selection) {
		if child.Is("div, p, span, a") {
			textPart += extractJustText(child)
		}
	})
	return textPart
}

View File

@@ -0,0 +1,103 @@
package scraper
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gosimple/slug"
"gitlab.com/kbr4/svevijesti/internal/model"
"math/rand"
"regexp"
"strings"
"time"
)
// Channels of the srpskainfo.com crawl.
//
// SrpskainfoArticles carries every article scraped from an article page and,
// once the homepage crawl has finished or failed, an article whose Title is
// model.Terminator.
// SrpskainfoCandidates carries the homepage links that look like article
// URLs, and finally model.Terminator.
// SrpskainfoApprovedSites (buffered, capacity 2) carries the URLs that
// visitSiApprovedPages hands to the article collector; it also receives
// model.Terminator. The code that feeds it with real URLs is not in this file.
var SrpskainfoArticles = make(chan model.ScrapedArticle)
var SrpskainfoCandidates = make(chan string)
var SrpskainfoApprovedSites = make(chan string, 2)
// CrawlSrpskainfo starts one crawl of srpskainfo.com and returns without
// waiting for it.
//
// Two collectors restricted to srpskainfo.com are created: one for the
// homepage and one for article pages. The homepage visit and the consumer of
// SrpskainfoApprovedSites both run in their own goroutines.
func CrawlSrpskainfo() {
	newCollector := func() *colly.Collector {
		return colly.NewCollector(colly.AllowedDomains("srpskainfo.com"))
	}
	homeCollector := newCollector()
	articleCollector := newCollector()

	setupSiArticlePageCrawler(articleCollector)
	setupSiHomepageCrawler(homeCollector, articleCollector)

	go visitSiApprovedPages(articleCollector)
}
// visitSiApprovedPages visits every URL received on SrpskainfoApprovedSites
// with the article collector. It runs until the channel is closed.
//
// The homepage crawler also sends model.Terminator on this channel. That
// value is a marker, not a URL, so it is skipped instead of being passed to
// Visit. The loop deliberately keeps running after the marker, so URLs that
// arrive later are still consumed.
func visitSiApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for url := range SrpskainfoApprovedSites {
		if url == model.Terminator {
			continue
		}
		fmt.Println("Visiting: ", url)
		if err := crArticlePage.Visit(url); err != nil {
			fmt.Println("Problem visiting: ", url, err)
		}
	}
}
// setupSiHomepageCrawler registers the homepage callbacks on crHomePage and
// starts visiting https://srpskainfo.com in a new goroutine.
//
// Every link whose href ends in at least four dash-separated alphanumeric
// words followed by "/" is sent on SrpskainfoCandidates. When the homepage
// has been scraped, or fetching it failed, the terminator is sent on
// SrpskainfoArticles, SrpskainfoApprovedSites and SrpskainfoCandidates, in
// that order, after a 5 second pause.
//
// crArticlePage is accepted for signature compatibility and is not used.
func setupSiHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	// Compiled once here rather than for every anchor on the page. The
	// pattern is a constant, so MustCompile can only fail on a programming
	// error.
	articleURL := regexp.MustCompile(`([A-Za-z0-9]+-){3,}([A-Za-z0-9]+)/$`)

	crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if articleURL.MatchString(href) {
			SrpskainfoCandidates <- href
		}
	})

	// signalEnd tells the consumers of all three channels that this crawl
	// has ended.
	signalEnd := func() {
		time.Sleep(5 * time.Second)
		SrpskainfoArticles <- model.ScrapedArticle{Title: model.Terminator}
		SrpskainfoApprovedSites <- model.Terminator
		SrpskainfoCandidates <- model.Terminator
	}
	crHomePage.OnScraped(func(_ *colly.Response) { signalEnd() })
	crHomePage.OnError(func(_ *colly.Response, _ error) { signalEnd() })

	go func() {
		if err := crHomePage.Visit("https://srpskainfo.com"); err != nil {
			fmt.Println("Problem visiting: ", "https://srpskainfo.com", err)
		}
	}()
}
// setupSiArticlePageCrawler registers the article-page callbacks on
// crArticlePage.
//
// For every fetched page the first "h1" element supplies the title, and the
// text of all "div.article__top-content, div.article__content" elements,
// concatenated and trimmed, supplies the content. The resulting article is
// sent on SrpskainfoArticles. Crawl errors are printed together with their
// cause, as the avaz and bljesak crawlers do.
func setupSiArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
		title := ""
		e.ForEachWithBreak("h1", func(_ int, el *colly.HTMLElement) bool {
			title = el.Text
			return false // the first match is enough
		})

		var text strings.Builder
		e.ForEach("div.article__top-content, div.article__content", func(_ int, el *colly.HTMLElement) {
			text.WriteString(extractJustText(el.DOM))
		})

		article := model.ScrapedArticle{
			Title:       title,
			Content:     strings.TrimSpace(text.String()),
			OriginalUrl: e.Request.URL.String(),
			SourceId:    model.SrpskainfoSource,
		}
		// The random number makes slugs of equally titled articles differ.
		slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
		article.Slug = slug.Make(slugBase)

		SrpskainfoArticles <- article
	})

	crArticlePage.OnError(func(_ *colly.Response, err error) {
		fmt.Println("Problem crawling!", err)
	})
}

110
internal/server/articles.go Normal file
View File

@@ -0,0 +1,110 @@
package server
import (
"fmt"
"github.com/gorilla/mux"
"gitlab.com/kbr4/svevijesti/internal/database"
"net/http"
"strconv"
"time"
)
// rootHandler renders the home page: today's articles, a "previous" link to
// yesterday's page and a "next" link back to "/".
//
// Each failure ends the request with a single error response; the handler
// does not carry on with a nil store or write a second response.
//
// NOTE(review): a new database handle is opened and closed for every request.
// sql.DB is designed to be long-lived and shared — consider creating it once
// at startup.
func rootHandler(wr http.ResponseWriter, req *http.Request) {
	store, err := database.Connect()
	if err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
		return
	}
	defer store.Close()

	now := time.Now()
	articles, err := database.ArticlesForDay(store, now)
	if err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
		return
	}

	// AddDate steps one calendar day back; subtracting 24 hours can land on
	// the wrong date around a daylight-saving change.
	dayBefore := "/dan/" + now.AddDate(0, 0, -1).Format("2006-01-02")

	data := map[string]interface{}{
		"title":    "Pocetna",
		"articles": articles,
		"previous": dayBefore,
		"next":     "/",
	}
	if err := templates.ExecuteTemplate(wr, "homeHTML", data); err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
	}
}
// dailyArticlesHandler renders the articles of the day given by the {date}
// route variable (format YYYY-MM-DD), with links to the day before and the
// day after. When the day after is today, the "next" link is "/".
//
// A date that does not parse yields 404. Each failure ends the request with a
// single error response; the handler does not carry on with a zero date or a
// nil store.
func dailyArticlesHandler(wr http.ResponseWriter, req *http.Request) {
	const dayLayout = "2006-01-02"

	vars := mux.Vars(req)
	day, err := time.Parse(dayLayout, vars["date"])
	if err != nil {
		http.Error(wr, err.Error(), http.StatusNotFound)
		return
	}

	dayBefore := "/dan/" + day.AddDate(0, 0, -1).Format(dayLayout)
	nextDay := day.AddDate(0, 0, 1).Format(dayLayout)
	dayAfter := "/dan/" + nextDay
	if nextDay == time.Now().Format(dayLayout) {
		// Today's articles live on the home page.
		dayAfter = "/"
	}

	store, err := database.Connect()
	if err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
		return
	}
	defer store.Close()

	articles, err := database.ArticlesForDay(store, day)
	if err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
		return
	}

	data := map[string]interface{}{
		"title":    fmt.Sprintf("Stare novine na dan %s", day.Format(dayLayout)),
		"articles": articles,
		"previous": dayBefore,
		"next":     dayAfter,
	}
	if err := templates.ExecuteTemplate(wr, "homeHTML", data); err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
	}
}
// articleHandler renders a single article selected by the {id} and {slug}
// route variables, with links to the previous and next article.
//
// An id that does not fit an int is replaced by -1. A failed article lookup
// yields 404, and the handler stops there instead of rendering an empty
// article after the error response.
func articleHandler(wr http.ResponseWriter, req *http.Request) {
	store, err := database.Connect()
	if err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
		return
	}
	defer store.Close()

	vars := mux.Vars(req)
	articleID, err := strconv.Atoi(vars["id"])
	if err != nil {
		articleID = -1
	}

	article, err := database.ArticleByID(store, articleID, vars["slug"])
	if err != nil {
		http.Error(wr, err.Error(), http.StatusNotFound)
		return
	}

	// Navigation is best effort: when the lookup fails the links keep the
	// placeholder values that are returned alongside the error.
	next, previous, _ := database.PreviousAndNextArticleUrlByID(store, articleID)

	data := map[string]interface{}{
		"title":    article.Title,
		"article":  article,
		"previous": previous,
		"next":     next,
	}
	if err := templates.ExecuteTemplate(wr, "articleHTML", data); err != nil {
		http.Error(wr, err.Error(), http.StatusInternalServerError)
	}
}

46
internal/server/server.go Normal file
View File

@@ -0,0 +1,46 @@
package server
import (
"fmt"
"github.com/gorilla/mux"
"html/template"
"io/ioutil"
"path/filepath"
"strings"
)
// tPath and dPath name the template and data directories; neither is
// referenced by the code shown in this file.
var tPath = "./web/tpl/"
var dPath = "./web/data/"

// templateDirs lists the directories getTemplates searches for .html files.
var templateDirs = []string{"./web/tpl", "./web/data"}

// templates is the parsed template set used by the handlers; init assigns it.
var templates *template.Template
// getTemplates parses every .html file found directly inside the directories
// listed in templateDirs into one template set.
//
// A directory that cannot be read is reported on standard output and skipped,
// so one missing directory does not prevent the others from loading.
// Sub-directories are ignored. The error from ParseFiles is returned as is,
// including the case where no template file was found at all.
func getTemplates() (templates *template.Template, err error) {
	var allFiles []string
	for _, dir := range templateDirs {
		entries, readErr := ioutil.ReadDir(dir)
		if readErr != nil {
			fmt.Println("Template directory skipped: ", dir, readErr)
			continue
		}
		for _, entry := range entries {
			if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".html") {
				continue
			}
			filePath := filepath.Join(dir, entry.Name())
			fmt.Println("Template found: ", filePath)
			allFiles = append(allFiles, filePath)
		}
	}
	return template.New("").ParseFiles(allFiles...)
}
// init loads the template set used by the handlers.
//
// When loading fails the error is printed and an empty template set is
// installed. Handlers then get an error from ExecuteTemplate and answer with
// a 500, instead of dereferencing a nil template set.
func init() {
	loaded, err := getTemplates()
	if err != nil {
		fmt.Println("Loading templates failed: ", err)
		loaded = template.New("")
	}
	templates = loaded
}
// CreateRoutes builds the router of the site:
//
//	/dan/{date}          articles of one day
//	/{id:[0-9]+}/{slug}  a single article
//	/                    home page
func CreateRoutes() *mux.Router {
	router := mux.NewRouter()

	router.HandleFunc("/dan/{date}", dailyArticlesHandler)
	router.HandleFunc("/{id:[0-9]+}/{slug}", articleHandler)
	router.HandleFunc("/", rootHandler)

	return router
}