Fresh
This commit is contained in:
188
internal/database/articles.go
Normal file
188
internal/database/articles.go
Normal file
@@ -0,0 +1,188 @@
|
||||
package database
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
_ "github.com/lib/pq"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"html/template"
|
||||
"math"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func InsertArticle(store *Store, article model.ScrapedArticle) (err error) {
|
||||
query := `
|
||||
INSERT INTO articles
|
||||
(title, content, slug, original_url, source_id)
|
||||
VALUES
|
||||
($1,$2,$3,$4,$5);`
|
||||
|
||||
_, err = store.Exec(query, article.Title, article.Content, article.Slug, article.OriginalUrl, article.SourceId)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func IsSaved(store *Store, url string) bool {
|
||||
|
||||
exists := false
|
||||
query, err := store.Prepare(`
|
||||
select exists(select 1 from articles where original_url = $1);
|
||||
`)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer query.Close()
|
||||
|
||||
row := query.QueryRow(url)
|
||||
err = row.Scan(&exists)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return exists
|
||||
}
|
||||
|
||||
func ArticlesForDay(store *Store, day time.Time) (articles []model.DisplayArticle, err error) {
|
||||
|
||||
result := []model.DisplayArticle{}
|
||||
query, err := store.Prepare(`
|
||||
select id,title, content, slug, original_url, source_id, created_at from articles where created_at > $1 and created_at < $2 and LENGTH(content) > 10 order by id desc;
|
||||
`)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
defer query.Close()
|
||||
|
||||
tomorrow := day.AddDate(0, 0, 1)
|
||||
todayDate := day.Format("2006-01-02")
|
||||
tomorrowDate := tomorrow.Format("2006-01-02")
|
||||
|
||||
rows, err := query.Query(todayDate, tomorrowDate)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
for rows.Next() {
|
||||
r := model.DisplayArticle{}
|
||||
err = rows.Scan(&r.ID, &r.Title, &r.Content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
|
||||
ago := time.Now().Sub(r.CreatedAt)
|
||||
hours := ago.Hours()
|
||||
|
||||
if hours < 1 {
|
||||
r.FormatedCreatedAt = fmt.Sprintf("Prije %d minuta.", int(math.Floor(ago.Minutes())))
|
||||
|
||||
} else if hours > 24 {
|
||||
r.FormatedCreatedAt = r.CreatedAt.Format("02.01.2006. 15:04:05")
|
||||
} else {
|
||||
r.FormatedCreatedAt = fmt.Sprintf("Prije %d sati.", int(math.Floor(hours)))
|
||||
}
|
||||
r.SourceName = model.SourceName(r.SourceId)
|
||||
|
||||
result = append(result, r)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func ArticleByID(store *Store, ID int, slug string) (article model.DisplayArticle, err error) {
|
||||
|
||||
result := model.DisplayArticle{}
|
||||
query, err := store.Prepare(`
|
||||
select id,title, content, slug, original_url, source_id, created_at from articles where id = $1 and slug = $2;
|
||||
`)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
defer query.Close()
|
||||
|
||||
row := query.QueryRow(ID, slug)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
|
||||
r := model.DisplayArticle{}
|
||||
content := ""
|
||||
err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
|
||||
ago := time.Now().Sub(r.CreatedAt)
|
||||
hours := ago.Hours()
|
||||
|
||||
r.Content = template.HTML(strings.Replace(content, "\n", "<br>\n", -1))
|
||||
|
||||
if hours < 1 {
|
||||
r.FormatedCreatedAt = fmt.Sprintf("Prije %d minuta.", int(math.Floor(ago.Minutes())))
|
||||
|
||||
} else if hours > 24 {
|
||||
r.FormatedCreatedAt = r.CreatedAt.Format("02.01.2006. 15:04:05")
|
||||
} else {
|
||||
r.FormatedCreatedAt = fmt.Sprintf("Prije %d sati.", int(math.Floor(hours)))
|
||||
}
|
||||
r.SourceName = model.SourceName(r.SourceId)
|
||||
|
||||
result = r
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func PreviousAndNextArticleUrlByID(store *Store, ID int) (nextUrl string, previousUrl string, err error) {
|
||||
|
||||
nextResult, previousResult := "#", "#"
|
||||
query, err := store.Prepare(`
|
||||
select id,title, content, slug, original_url, source_id, created_at from articles where id < $1 and id > $2 order by id desc limit 1;
|
||||
`)
|
||||
if err != nil {
|
||||
fmt.Println("Err 1:", err)
|
||||
return nextResult, previousResult, err
|
||||
}
|
||||
defer query.Close()
|
||||
|
||||
row := query.QueryRow(ID, 0)
|
||||
if err != nil {
|
||||
fmt.Println("Err 2:", err)
|
||||
return nextResult, previousResult, err
|
||||
}
|
||||
|
||||
r := model.DisplayArticle{}
|
||||
content := ""
|
||||
err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt)
|
||||
if err != nil {
|
||||
return nextResult, previousResult, err
|
||||
}
|
||||
|
||||
previousResult = fmt.Sprintf("/%d/%s", r.ID, r.Slug)
|
||||
|
||||
query2, err := store.Prepare(`
|
||||
select id,title, content, slug, original_url, source_id, created_at from articles where id < $1 and id > $2 order by id asc limit 1;
|
||||
`)
|
||||
if err != nil {
|
||||
fmt.Println("Err 1:", err)
|
||||
return nextResult, previousResult, err
|
||||
}
|
||||
defer query2.Close()
|
||||
|
||||
row = query2.QueryRow(ID+1000, ID)
|
||||
if err != nil {
|
||||
fmt.Println("Err 3:", err)
|
||||
return nextResult, previousResult, err
|
||||
}
|
||||
|
||||
content = ""
|
||||
err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt)
|
||||
if err != nil {
|
||||
fmt.Println("Err 4:", err)
|
||||
return nextResult, previousResult, err
|
||||
}
|
||||
nextResult = fmt.Sprintf("/%d/%s", r.ID, r.Slug)
|
||||
|
||||
return nextResult, previousResult, nil
|
||||
}
|
||||
25
internal/database/database.go
Normal file
25
internal/database/database.go
Normal file
@@ -0,0 +1,25 @@
|
||||
package database
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
_ "github.com/lib/pq"
|
||||
)
|
||||
|
||||
// Connection settings used by Connect to build the PostgreSQL connection
// string.
//
// NOTE(review): the password is compiled into the binary and committed with
// the source; consider reading these values from the environment instead.
const (
	host     = "localhost"
	port     = 5432
	user     = "svevijesti"
	password = "salmonela pljusti 221 hamo"
	dbname   = "svevijestiweb"
)
|
||||
|
||||
// Store is the database handle passed to the query functions of this package.
// It is an alias for sql.DB, so a *sql.DB can be used wherever a *Store is
// expected.
type Store = sql.DB
|
||||
|
||||
func Connect() (*Store, error) {
|
||||
psqlInfo := fmt.Sprintf("host=%s port=%d user=%s "+
|
||||
"password='%s' dbname=%s sslmode=disable",
|
||||
host, port, user, password, dbname)
|
||||
db, err := sql.Open("postgres", psqlInfo)
|
||||
return db, err
|
||||
}
|
||||
51
internal/model/model.go
Normal file
51
internal/model/model.go
Normal file
@@ -0,0 +1,51 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"html/template"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ScrapedArticle is an article as produced by a crawler, before it is stored.
type ScrapedArticle struct {
	// Title is the headline text taken from the page.
	Title string
	// Content is the extracted plain-text body.
	Content string
	// Slug is the URL-friendly identifier generated for the article.
	Slug string
	// OriginalUrl is the address of the page the article was scraped from.
	OriginalUrl string
	// SourceId identifies the news site; see the *Source constants.
	SourceId int
}
|
||||
|
||||
// DisplayArticle is an article as loaded from the database for rendering.
type DisplayArticle struct {
	// ID is the value of the article's id column.
	ID int
	// Title is the headline.
	Title string
	// Content is the article body. Being template.HTML, it is emitted by
	// html/template without escaping.
	Content template.HTML
	// Slug is the URL-friendly identifier stored with the article.
	Slug string
	// OriginalUrl is the address the article was scraped from.
	OriginalUrl string
	// SourceId identifies the news site; see the *Source constants.
	SourceId int
	// CreatedAt is the value of the article's created_at column.
	CreatedAt time.Time
	// FormatedCreatedAt is a display string derived from CreatedAt.
	FormatedCreatedAt string
	// SourceName is the short site name derived from SourceId.
	SourceName string
}
|
||||
|
||||
// Identifiers of the supported news sites, as stored in ScrapedArticle.SourceId
// and DisplayArticle.SourceId.
const (
	KlixSource       = 1
	SrpskainfoSource = 2
	BljesakSource    = 3
	AvazSource       = 4
)

// SourceName maps a source identifier to the site's short lowercase name.
// Identifiers that match none of the *Source constants yield "starenovine".
func SourceName(sourceId int) string {
	name := "starenovine"
	switch sourceId {
	case KlixSource:
		name = "klix"
	case SrpskainfoSource:
		name = "srpskainfo"
	case BljesakSource:
		name = "bljesak"
	case AvazSource:
		name = "avaz"
	}
	return name
}
|
||||
|
||||
// Terminator is the sentinel value the crawlers send on their channels (as a
// URL, or as an article title) to signal that a crawl has finished.
const Terminator = "TERMINATED"
|
||||
104
internal/scraper/avaz.go
Normal file
104
internal/scraper/avaz.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var AvazArticles = make(chan model.ScrapedArticle)
|
||||
var AvazCandidates = make(chan string)
|
||||
var AvazApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlAvaz() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("avaz.ba"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("avaz.ba"))
|
||||
|
||||
setupAvazArticlePageCrawler(crArticlePage)
|
||||
setupAvazHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitAvazApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitAvazApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range AvazApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupAvazHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
articleUrlR, _ := regexp.Compile("/\\d\\d+/([a-z0-9-]+)")
|
||||
articleBlacklist, _ := regexp.Compile("(english)")
|
||||
crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
|
||||
url := e.Attr("href")
|
||||
completeUrl := url
|
||||
if articleUrlR.MatchString(url) && !articleBlacklist.MatchString(url) {
|
||||
AvazCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
AvazArticles <- terminating
|
||||
AvazApprovedSites <- model.Terminator
|
||||
AvazCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
AvazArticles <- terminating
|
||||
AvazApprovedSites <- model.Terminator
|
||||
AvazCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://avaz.ba")
|
||||
}
|
||||
|
||||
func setupAvazArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("p.podtitle, div.artikal-text", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.AvazSource
|
||||
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||
article.Slug = slug.Make(slugBase)
|
||||
|
||||
AvazArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, err error) {
|
||||
fmt.Println("Problem crawling!", err)
|
||||
})
|
||||
|
||||
}
|
||||
104
internal/scraper/blijesak.go
Normal file
104
internal/scraper/blijesak.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var BljesakArticles = make(chan model.ScrapedArticle)
|
||||
var BljesakCandidates = make(chan string)
|
||||
var BljesakApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlBljesak() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("bljesak.info"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("bljesak.info"))
|
||||
|
||||
setupBljesakArticlePageCrawler(crArticlePage)
|
||||
setupBljesakHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitBljesakApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitBljesakApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range BljesakApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupBljesakHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
||||
articleBlacklist, _ := regexp.Compile("(info-vodic|foto-data)")
|
||||
crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
|
||||
url := e.Attr("href")
|
||||
completeUrl := url
|
||||
if articleUrlR.MatchString(url) && !articleBlacklist.MatchString(url) {
|
||||
BljesakCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
BljesakArticles <- terminating
|
||||
BljesakApprovedSites <- model.Terminator
|
||||
BljesakCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
BljesakArticles <- terminating
|
||||
BljesakApprovedSites <- model.Terminator
|
||||
BljesakCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://bljesak.info")
|
||||
}
|
||||
|
||||
func setupBljesakArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("div.intro, div#infiniteLoadBreakpoint", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.BljesakSource
|
||||
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||
article.Slug = slug.Make(slugBase)
|
||||
|
||||
BljesakArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, err error) {
|
||||
fmt.Println("Problem crawling!", err)
|
||||
})
|
||||
|
||||
}
|
||||
103
internal/scraper/klix.go
Normal file
103
internal/scraper/klix.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var KlixArticles = make(chan model.ScrapedArticle)
|
||||
var KlixCandidates = make(chan string)
|
||||
var KlixApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlKlix() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||
|
||||
setupKlArticlePageCrawler(crArticlePage)
|
||||
setupKlHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range KlixApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupKlHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
||||
crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
|
||||
url := e.Attr("href")
|
||||
completeUrl := "https://www.klix.ba" + url
|
||||
if articleUrlR.MatchString(url) {
|
||||
KlixCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
KlixArticles <- terminating
|
||||
KlixApprovedSites <- model.Terminator
|
||||
KlixCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
KlixArticles <- terminating
|
||||
KlixApprovedSites <- model.Terminator
|
||||
KlixCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://www.klix.ba")
|
||||
}
|
||||
|
||||
func setupKlArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("title", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("div#text, p.lead", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.KlixSource
|
||||
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||
article.Slug = slug.Make(slugBase)
|
||||
|
||||
KlixArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, _ error) {
|
||||
fmt.Println("Problem crawling!")
|
||||
})
|
||||
|
||||
}
|
||||
26
internal/scraper/scraper.go
Normal file
26
internal/scraper/scraper.go
Normal file
@@ -0,0 +1,26 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
func extractJustText(el *goquery.Selection) string {
|
||||
textPart := ""
|
||||
htmlPart, _ := el.Html()
|
||||
if len(el.Nodes) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
//fmt.Println("Checking: ", htmlPart, "Duzina: ", strconv.Itoa(len(el.Nodes)), " Type je ", el.Nodes[0].Type, " jednakost ", el.Text() == htmlPart)
|
||||
if el.Text() == htmlPart {
|
||||
return el.Text() + "\n"
|
||||
}
|
||||
|
||||
el.Children().Each(func(_ int, el2 *goquery.Selection) {
|
||||
if el2.Is("div, p, span, a") {
|
||||
textPart += extractJustText(el2)
|
||||
}
|
||||
})
|
||||
|
||||
return textPart
|
||||
}
|
||||
103
internal/scraper/srpskainfo.go
Normal file
103
internal/scraper/srpskainfo.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var SrpskainfoArticles = make(chan model.ScrapedArticle)
|
||||
var SrpskainfoCandidates = make(chan string)
|
||||
var SrpskainfoApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlSrpskainfo() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com"))
|
||||
|
||||
setupSiArticlePageCrawler(crArticlePage)
|
||||
setupSiHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitSiApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitSiApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range SrpskainfoApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupSiHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
|
||||
articleUrlR, _ := regexp.Compile("([A-Za-z0-9]+-){3,}([A-Za-z0-9]+)/$")
|
||||
url := e.Attr("href")
|
||||
completeUrl := url
|
||||
if articleUrlR.MatchString(url) {
|
||||
SrpskainfoCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
SrpskainfoArticles <- terminating
|
||||
SrpskainfoApprovedSites <- model.Terminator
|
||||
SrpskainfoCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
SrpskainfoArticles <- terminating
|
||||
SrpskainfoApprovedSites <- model.Terminator
|
||||
SrpskainfoCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://srpskainfo.com")
|
||||
}
|
||||
|
||||
func setupSiArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("h1", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("div.article__top-content, div.article__content", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.SrpskainfoSource
|
||||
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||
article.Slug = slug.Make(slugBase)
|
||||
|
||||
SrpskainfoArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, _ error) {
|
||||
fmt.Println("Problem crawling!")
|
||||
})
|
||||
|
||||
}
|
||||
110
internal/server/articles.go
Normal file
110
internal/server/articles.go
Normal file
@@ -0,0 +1,110 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gorilla/mux"
|
||||
"gitlab.com/kbr4/svevijesti/internal/database"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
func rootHandler(wr http.ResponseWriter, req *http.Request) {
|
||||
title := "Pocetna"
|
||||
store, err := database.Connect()
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusInternalServerError)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
articles, err := database.ArticlesForDay(store, time.Now())
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusInternalServerError)
|
||||
}
|
||||
|
||||
dayBefore := "/dan/" + time.Now().Add(-24*time.Hour).Format("2006-01-02")
|
||||
|
||||
data := map[string]interface{}{
|
||||
"title": title,
|
||||
"articles": articles,
|
||||
"previous": dayBefore,
|
||||
"next": "/",
|
||||
}
|
||||
|
||||
err = templates.ExecuteTemplate(wr, "homeHTML", data)
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusInternalServerError)
|
||||
}
|
||||
}
|
||||
|
||||
func dailyArticlesHandler(wr http.ResponseWriter, req *http.Request) {
|
||||
vars := mux.Vars(req)
|
||||
day, err := time.Parse("2006-01-02", vars["date"])
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusNotFound)
|
||||
}
|
||||
dayBefore := "/dan/" + day.Add(-24*time.Hour).Format("2006-01-02")
|
||||
dayAfter := "/dan/" + day.Add(24*time.Hour).Format("2006-01-02")
|
||||
|
||||
if day.Add(24*time.Hour).Format("2006-01-02") == time.Now().Format("2006-01-02") {
|
||||
dayAfter = "/"
|
||||
}
|
||||
|
||||
title := fmt.Sprintf("Stare novine na dan %s", day.Format("2006-01-02"))
|
||||
store, err := database.Connect()
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusInternalServerError)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
articles, err := database.ArticlesForDay(store, day)
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusInternalServerError)
|
||||
}
|
||||
|
||||
data := map[string]interface{}{
|
||||
"title": title,
|
||||
"articles": articles,
|
||||
"previous": dayBefore,
|
||||
"next": dayAfter,
|
||||
}
|
||||
|
||||
err = templates.ExecuteTemplate(wr, "homeHTML", data)
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusInternalServerError)
|
||||
}
|
||||
}
|
||||
|
||||
func articleHandler(wr http.ResponseWriter, req *http.Request) {
|
||||
store, err := database.Connect()
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusInternalServerError)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
vars := mux.Vars(req)
|
||||
articleID, err := strconv.Atoi(vars["id"])
|
||||
if err != nil {
|
||||
articleID = -1
|
||||
}
|
||||
articleSlug := vars["slug"]
|
||||
article, err := database.ArticleByID(store, articleID, articleSlug)
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusNotFound)
|
||||
}
|
||||
|
||||
next, previous, _ := database.PreviousAndNextArticleUrlByID(store, articleID)
|
||||
|
||||
title := article.Title
|
||||
data := map[string]interface{}{
|
||||
"title": title,
|
||||
"article": article,
|
||||
"previous": previous,
|
||||
"next": next,
|
||||
}
|
||||
|
||||
err = templates.ExecuteTemplate(wr, "articleHTML", data)
|
||||
if err != nil {
|
||||
http.Error(wr, err.Error(), http.StatusInternalServerError)
|
||||
}
|
||||
}
|
||||
46
internal/server/server.go
Normal file
46
internal/server/server.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gorilla/mux"
|
||||
"html/template"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Locations of the HTML templates and the parsed template set.
var (
	// tPath and dPath are the template and data directories, written with a
	// trailing slash.
	tPath = "./web/tpl/"
	dPath = "./web/data/"

	// templateDirs lists the directories getTemplates scans for .html files.
	templateDirs = []string{"./web/tpl", "./web/data"}

	// templates is the parsed template set used by the handlers. It is
	// assigned in init.
	templates *template.Template
)
|
||||
|
||||
func getTemplates() (templates *template.Template, err error) {
|
||||
var allFiles []string
|
||||
for _, dir := range templateDirs {
|
||||
files2, _ := ioutil.ReadDir(dir)
|
||||
for _, file := range files2 {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".html") {
|
||||
filePath := filepath.Join(dir, filename)
|
||||
fmt.Println("Template found: ", filePath)
|
||||
allFiles = append(allFiles, filePath)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
templates, err = template.New("").ParseFiles(allFiles...)
|
||||
return
|
||||
}
|
||||
|
||||
func init() {
|
||||
templates, _ = getTemplates()
|
||||
}
|
||||
|
||||
func CreateRoutes() *mux.Router {
|
||||
r := mux.NewRouter()
|
||||
r.HandleFunc("/dan/{date}", dailyArticlesHandler)
|
||||
r.HandleFunc("/{id:[0-9]+}/{slug}", articleHandler)
|
||||
r.HandleFunc("/", rootHandler)
|
||||
return r
|
||||
}
|
||||
Reference in New Issue
Block a user