Added srpskainfo crawler
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"github.com/lib/pq"
|
"github.com/lib/pq"
|
||||||
"gitlab.com/kbr4/svevijesti/internal/database"
|
"gitlab.com/kbr4/svevijesti/internal/database"
|
||||||
|
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||||
"gitlab.com/kbr4/svevijesti/internal/scraper"
|
"gitlab.com/kbr4/svevijesti/internal/scraper"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -15,17 +16,37 @@ func main() {
|
|||||||
|
|
||||||
defer store.Close()
|
defer store.Close()
|
||||||
go candidateChecker()
|
go candidateChecker()
|
||||||
|
go scraper.CrawlSrpskainfo()
|
||||||
go scraper.CrawlKlix()
|
go scraper.CrawlKlix()
|
||||||
|
|
||||||
for article := range scraper.KlixArticles {
|
article := model.ScrapedArticle{}
|
||||||
fmt.Println("Saving ", article.OriginalUrl)
|
|
||||||
err = database.InsertArticle(store, article)
|
for {
|
||||||
if err, ok := err.(*pq.Error); ok {
|
select {
|
||||||
if err.Code.Name() != "unique_violation" {
|
case article = <-scraper.KlixArticles:
|
||||||
panic(err)
|
if article.Title == model.Terminator {
|
||||||
} else {
|
scraper.KlixArticles = nil
|
||||||
fmt.Println("Skipping: ", article.OriginalUrl)
|
|
||||||
}
|
}
|
||||||
|
case article = <-scraper.SrpskainfoArticles:
|
||||||
|
if article.Title == model.Terminator {
|
||||||
|
scraper.SrpskainfoArticles = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if article.Title != model.Terminator {
|
||||||
|
fmt.Println("Saving ", article.OriginalUrl)
|
||||||
|
err = database.InsertArticle(store, article)
|
||||||
|
if err, ok := err.(*pq.Error); ok {
|
||||||
|
if err.Code.Name() != "unique_violation" {
|
||||||
|
panic(err)
|
||||||
|
} else {
|
||||||
|
fmt.Println("Skipping: ", article.OriginalUrl)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if scraper.KlixArticles == nil && scraper.SrpskainfoArticles == nil {
|
||||||
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -37,10 +58,31 @@ func candidateChecker() {
|
|||||||
}
|
}
|
||||||
defer store.Close()
|
defer store.Close()
|
||||||
|
|
||||||
for url := range scraper.KlixCandidates {
|
for {
|
||||||
if !database.IsSaved(store, url) {
|
select {
|
||||||
scraper.KlixApprovedSites <- url
|
case url := <-scraper.KlixCandidates:
|
||||||
|
if url == model.Terminator {
|
||||||
|
scraper.KlixCandidates = nil
|
||||||
|
} else {
|
||||||
|
if !database.IsSaved(store, url) {
|
||||||
|
scraper.KlixApprovedSites <- url
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
case url := <-scraper.SrpskainfoCandidates:
|
||||||
|
if url == model.Terminator {
|
||||||
|
scraper.SrpskainfoCandidates = nil
|
||||||
|
} else {
|
||||||
|
if !database.IsSaved(store, url) {
|
||||||
|
scraper.SrpskainfoApprovedSites <- url
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if scraper.KlixCandidates == nil && scraper.SrpskainfoCandidates == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,13 +26,20 @@ type DisplayArticle struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
KlixSource = 1
|
KlixSource = 1
|
||||||
|
SrpskainfoSource = 2
|
||||||
)
|
)
|
||||||
|
|
||||||
func SourceName(sourceId int) string {
|
func SourceName(sourceId int) string {
|
||||||
switch sourceId {
|
switch sourceId {
|
||||||
case KlixSource:
|
case KlixSource:
|
||||||
return "klix"
|
return "klix"
|
||||||
|
case SrpskainfoSource:
|
||||||
|
return "srpskainfo"
|
||||||
}
|
}
|
||||||
return "starenovine"
|
return "starenovine"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
Terminator = "TERMINATED"
|
||||||
|
)
|
||||||
|
|||||||
@@ -19,8 +19,8 @@ func CrawlKlix() {
|
|||||||
crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||||
|
|
||||||
setupArticlePageCrawler(crArticlePage)
|
setupKlArticlePageCrawler(crArticlePage)
|
||||||
setupHomepageCrawler(crHomePage, crArticlePage)
|
setupKlHomepageCrawler(crHomePage, crArticlePage)
|
||||||
|
|
||||||
go visitApprovedPages(crArticlePage)
|
go visitApprovedPages(crArticlePage)
|
||||||
}
|
}
|
||||||
@@ -33,7 +33,7 @@ func visitApprovedPages(crArticlePage *colly.Collector) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
func setupKlHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||||
|
|
||||||
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
||||||
crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
|
crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
|
||||||
@@ -46,21 +46,26 @@ func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Coll
|
|||||||
|
|
||||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||||
time.Sleep(5 * time.Second)
|
time.Sleep(5 * time.Second)
|
||||||
close(KlixArticles)
|
terminating := model.ScrapedArticle{}
|
||||||
close(KlixApprovedSites)
|
terminating.Title = model.Terminator
|
||||||
close(KlixCandidates)
|
KlixArticles <- terminating
|
||||||
|
KlixApprovedSites <- model.Terminator
|
||||||
|
KlixCandidates <- model.Terminator
|
||||||
})
|
})
|
||||||
|
|
||||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||||
close(KlixArticles)
|
time.Sleep(5 * time.Second)
|
||||||
close(KlixApprovedSites)
|
terminating := model.ScrapedArticle{}
|
||||||
close(KlixCandidates)
|
terminating.Title = model.Terminator
|
||||||
|
KlixArticles <- terminating
|
||||||
|
KlixApprovedSites <- model.Terminator
|
||||||
|
KlixCandidates <- model.Terminator
|
||||||
})
|
})
|
||||||
|
|
||||||
go crHomePage.Visit("https://www.klix.ba")
|
go crHomePage.Visit("https://www.klix.ba")
|
||||||
}
|
}
|
||||||
|
|
||||||
func setupArticlePageCrawler(crArticlePage *colly.Collector) {
|
func setupKlArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||||
|
|
||||||
url := e.Request.URL.String()
|
url := e.Request.URL.String()
|
||||||
|
|||||||
101
internal/scraper/srpskainfo.go
Normal file
101
internal/scraper/srpskainfo.go
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
package scraper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"github.com/gocolly/colly"
|
||||||
|
"github.com/gosimple/slug"
|
||||||
|
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var SrpskainfoArticles = make(chan model.ScrapedArticle)
|
||||||
|
var SrpskainfoCandidates = make(chan string)
|
||||||
|
var SrpskainfoApprovedSites = make(chan string, 2)
|
||||||
|
|
||||||
|
func CrawlSrpskainfo() {
|
||||||
|
|
||||||
|
crHomePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com"))
|
||||||
|
crArticlePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com"))
|
||||||
|
|
||||||
|
setupSiArticlePageCrawler(crArticlePage)
|
||||||
|
setupSiHomepageCrawler(crHomePage, crArticlePage)
|
||||||
|
|
||||||
|
go visitSiApprovedPages(crArticlePage)
|
||||||
|
}
|
||||||
|
|
||||||
|
func visitSiApprovedPages(crArticlePage *colly.Collector) {
|
||||||
|
fmt.Println("Consuming sites!")
|
||||||
|
for url := range SrpskainfoApprovedSites {
|
||||||
|
fmt.Println("Visiting: ", url)
|
||||||
|
crArticlePage.Visit(url)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func setupSiHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||||
|
|
||||||
|
crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
|
||||||
|
articleUrlR, _ := regexp.Compile("([A-Za-z0-9]+-){3,}([A-Za-z0-9]+)/$")
|
||||||
|
url := e.Attr("href")
|
||||||
|
completeUrl := url
|
||||||
|
if articleUrlR.MatchString(url) {
|
||||||
|
SrpskainfoCandidates <- completeUrl
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||||
|
time.Sleep(5 * time.Second)
|
||||||
|
terminating := model.ScrapedArticle{}
|
||||||
|
terminating.Title = model.Terminator
|
||||||
|
SrpskainfoArticles <- terminating
|
||||||
|
SrpskainfoApprovedSites <- model.Terminator
|
||||||
|
SrpskainfoCandidates <- model.Terminator
|
||||||
|
})
|
||||||
|
|
||||||
|
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||||
|
time.Sleep(5 * time.Second)
|
||||||
|
terminating := model.ScrapedArticle{}
|
||||||
|
terminating.Title = model.Terminator
|
||||||
|
SrpskainfoArticles <- terminating
|
||||||
|
SrpskainfoApprovedSites <- model.Terminator
|
||||||
|
SrpskainfoCandidates <- model.Terminator
|
||||||
|
})
|
||||||
|
|
||||||
|
go crHomePage.Visit("https://srpskainfo.com")
|
||||||
|
}
|
||||||
|
|
||||||
|
func setupSiArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||||
|
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||||
|
|
||||||
|
url := e.Request.URL.String()
|
||||||
|
|
||||||
|
title := ""
|
||||||
|
e.ForEachWithBreak("h1", func(_ int, el *colly.HTMLElement) bool {
|
||||||
|
title = el.Text
|
||||||
|
return false
|
||||||
|
})
|
||||||
|
|
||||||
|
text := ""
|
||||||
|
|
||||||
|
e.ForEach("p.article__top-content, p.article__content, h4.article__content, h3.article__content, h2.article__content, div.article__content", func(_ int, el *colly.HTMLElement) {
|
||||||
|
text += extractJustText(el.DOM)
|
||||||
|
})
|
||||||
|
|
||||||
|
article := model.ScrapedArticle{}
|
||||||
|
|
||||||
|
trimmedText := strings.TrimSpace(text)
|
||||||
|
article.OriginalUrl = url
|
||||||
|
article.Title = title
|
||||||
|
article.Content = trimmedText
|
||||||
|
article.SourceId = model.SrpskainfoSource
|
||||||
|
article.Slug = slug.Make(title)
|
||||||
|
|
||||||
|
SrpskainfoArticles <- article
|
||||||
|
})
|
||||||
|
|
||||||
|
crArticlePage.OnError(func(_ *colly.Response, _ error) {
|
||||||
|
fmt.Println("Problem crawling!")
|
||||||
|
})
|
||||||
|
|
||||||
|
}
|
||||||
@@ -3,4 +3,5 @@
|
|||||||
sudo systemctl stop starenovine
|
sudo systemctl stop starenovine
|
||||||
sudo cp ./server /opt/starenovine/server
|
sudo cp ./server /opt/starenovine/server
|
||||||
sudo cp -R ./web /opt/starenovine/
|
sudo cp -R ./web /opt/starenovine/
|
||||||
|
sudo cp ./spider /opt/starenovine/spider
|
||||||
sudo systemctl start starenovine
|
sudo systemctl start starenovine
|
||||||
|
|||||||
Reference in New Issue
Block a user