Added srpskainfo crawler

This commit is contained in:
Senad Uka
2022-02-15 07:03:30 +01:00
parent 08d81be857
commit 6648f6754a
7 changed files with 178 additions and 22 deletions

View File

@@ -4,6 +4,7 @@ import (
"fmt" "fmt"
"github.com/lib/pq" "github.com/lib/pq"
"gitlab.com/kbr4/svevijesti/internal/database" "gitlab.com/kbr4/svevijesti/internal/database"
"gitlab.com/kbr4/svevijesti/internal/model"
"gitlab.com/kbr4/svevijesti/internal/scraper" "gitlab.com/kbr4/svevijesti/internal/scraper"
) )
@@ -15,17 +16,37 @@ func main() {
defer store.Close() defer store.Close()
go candidateChecker() go candidateChecker()
go scraper.CrawlSrpskainfo()
go scraper.CrawlKlix() go scraper.CrawlKlix()
for article := range scraper.KlixArticles { article := model.ScrapedArticle{}
fmt.Println("Saving ", article.OriginalUrl)
err = database.InsertArticle(store, article) for {
if err, ok := err.(*pq.Error); ok { select {
if err.Code.Name() != "unique_violation" { case article = <-scraper.KlixArticles:
panic(err) if article.Title == model.Terminator {
} else { scraper.KlixArticles = nil
fmt.Println("Skipping: ", article.OriginalUrl)
} }
case article = <-scraper.SrpskainfoArticles:
if article.Title == model.Terminator {
scraper.SrpskainfoArticles = nil
}
}
if article.Title != model.Terminator {
fmt.Println("Saving ", article.OriginalUrl)
err = database.InsertArticle(store, article)
if err, ok := err.(*pq.Error); ok {
if err.Code.Name() != "unique_violation" {
panic(err)
} else {
fmt.Println("Skipping: ", article.OriginalUrl)
}
}
}
if scraper.KlixArticles == nil && scraper.SrpskainfoArticles == nil {
break
} }
} }
} }
@@ -37,10 +58,31 @@ func candidateChecker() {
} }
defer store.Close() defer store.Close()
for url := range scraper.KlixCandidates { for {
if !database.IsSaved(store, url) { select {
scraper.KlixApprovedSites <- url case url := <-scraper.KlixCandidates:
if url == model.Terminator {
scraper.KlixCandidates = nil
} else {
if !database.IsSaved(store, url) {
scraper.KlixApprovedSites <- url
}
}
case url := <-scraper.SrpskainfoCandidates:
if url == model.Terminator {
scraper.SrpskainfoCandidates = nil
} else {
if !database.IsSaved(store, url) {
scraper.SrpskainfoApprovedSites <- url
}
}
} }
if scraper.KlixCandidates == nil && scraper.SrpskainfoCandidates == nil {
break
}
} }
} }

View File

@@ -26,13 +26,20 @@ type DisplayArticle struct {
} }
const ( const (
KlixSource = 1 KlixSource = 1
SrpskainfoSource = 2
) )
func SourceName(sourceId int) string { func SourceName(sourceId int) string {
switch sourceId { switch sourceId {
case KlixSource: case KlixSource:
return "klix" return "klix"
case SrpskainfoSource:
return "srpskainfo"
} }
return "starenovine" return "starenovine"
} }
// Terminator is the sentinel value the crawlers send on their channels to
// signal that no further items will follow. It is sent as a plain string on
// the URL channels and as the Title of an otherwise empty ScrapedArticle on
// the article channels.
const Terminator = "TERMINATED"

View File

@@ -19,8 +19,8 @@ func CrawlKlix() {
crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba")) crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba")) crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
setupArticlePageCrawler(crArticlePage) setupKlArticlePageCrawler(crArticlePage)
setupHomepageCrawler(crHomePage, crArticlePage) setupKlHomepageCrawler(crHomePage, crArticlePage)
go visitApprovedPages(crArticlePage) go visitApprovedPages(crArticlePage)
} }
@@ -33,7 +33,7 @@ func visitApprovedPages(crArticlePage *colly.Collector) {
} }
} }
func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { func setupKlHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
articleUrlR, _ := regexp.Compile("\\d\\d+$") articleUrlR, _ := regexp.Compile("\\d\\d+$")
crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) { crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
@@ -46,21 +46,26 @@ func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Coll
crHomePage.OnScraped(func(_ *colly.Response) { crHomePage.OnScraped(func(_ *colly.Response) {
time.Sleep(5 * time.Second) time.Sleep(5 * time.Second)
close(KlixArticles) terminating := model.ScrapedArticle{}
close(KlixApprovedSites) terminating.Title = model.Terminator
close(KlixCandidates) KlixArticles <- terminating
KlixApprovedSites <- model.Terminator
KlixCandidates <- model.Terminator
}) })
crHomePage.OnError(func(_ *colly.Response, _ error) { crHomePage.OnError(func(_ *colly.Response, _ error) {
close(KlixArticles) time.Sleep(5 * time.Second)
close(KlixApprovedSites) terminating := model.ScrapedArticle{}
close(KlixCandidates) terminating.Title = model.Terminator
KlixArticles <- terminating
KlixApprovedSites <- model.Terminator
KlixCandidates <- model.Terminator
}) })
go crHomePage.Visit("https://www.klix.ba") go crHomePage.Visit("https://www.klix.ba")
} }
func setupArticlePageCrawler(crArticlePage *colly.Collector) { func setupKlArticlePageCrawler(crArticlePage *colly.Collector) {
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) { crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
url := e.Request.URL.String() url := e.Request.URL.String()

View File

@@ -0,0 +1,101 @@
package scraper
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gosimple/slug"
"gitlab.com/kbr4/svevijesti/internal/model"
"regexp"
"strings"
"time"
)
// Channels used by the srpskainfo crawler. None of them is closed by the
// crawler; the end of a crawl is signalled by sending model.Terminator.
var (
	// SrpskainfoArticles carries every article scraped from an article page.
	// It is unbuffered, so the scraper blocks until the article is received.
	SrpskainfoArticles = make(chan model.ScrapedArticle)

	// SrpskainfoCandidates carries the hrefs found on the homepage that look
	// like article links. It is unbuffered.
	SrpskainfoCandidates = make(chan string)

	// SrpskainfoApprovedSites carries the URLs that the article-page
	// collector should visit. It has room for two pending URLs.
	SrpskainfoApprovedSites = make(chan string, 2)
)
// CrawlSrpskainfo wires up and starts the srpskainfo.com crawl.
//
// Two collectors restricted to the srpskainfo.com domain are created: one for
// the homepage and one for article pages. Their callbacks are registered, and
// a goroutine is started that visits every URL arriving on
// SrpskainfoApprovedSites with the article-page collector.
func CrawlSrpskainfo() {
	const allowedDomain = "srpskainfo.com"
	newCollector := func() *colly.Collector {
		return colly.NewCollector(colly.AllowedDomains(allowedDomain))
	}

	homeCollector := newCollector()
	articleCollector := newCollector()

	setupSiArticlePageCrawler(articleCollector)
	setupSiHomepageCrawler(homeCollector, articleCollector)

	go visitSiApprovedPages(articleCollector)
}
// visitSiApprovedPages receives URLs from SrpskainfoApprovedSites and visits
// each of them with the article-page collector.
//
// The homepage crawler never closes the channel; it announces the end of a
// crawl by sending model.Terminator on it. That sentinel is therefore treated
// as the stop signal: it is not handed to the collector as if it were a URL,
// and the function returns instead of blocking on the channel forever.
// NOTE(review): this assumes the terminator is the last value sent on the
// channel — confirm against the producer of approved URLs.
func visitSiApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for url := range SrpskainfoApprovedSites {
		if url == model.Terminator {
			return
		}
		fmt.Println("Visiting: ", url)
		if err := crArticlePage.Visit(url); err != nil {
			// One page failing must not stop the crawl; report it and carry on.
			fmt.Println("Problem visiting: ", url, err)
		}
	}
}
// setupSiHomepageCrawler registers the homepage callbacks on crHomePage and
// starts visiting https://srpskainfo.com in a new goroutine.
//
// Every <a> element whose href ends in at least four hyphen-separated
// alphanumeric words followed by a slash is sent, unchanged, on
// SrpskainfoCandidates. When the homepage has been scraped, or when the
// request fails, model.Terminator is sent on SrpskainfoArticles,
// SrpskainfoApprovedSites and SrpskainfoCandidates, in that order.
//
// crArticlePage is accepted for signature parity with the other crawler
// set-up functions and is not used here.
func setupSiHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	// Compiled once per set-up rather than once per anchor element. The
	// pattern is a constant, so MustCompile cannot panic at run time.
	articleURLPattern := regexp.MustCompile("([A-Za-z0-9]+-){3,}([A-Za-z0-9]+)/$")

	// terminate announces the end of the crawl on all three channels. It is
	// shared by the success and failure callbacks so the two cannot drift apart.
	terminate := func() {
		// NOTE(review): the pause presumably lets in-flight article visits
		// finish before the terminators are sent — confirm.
		time.Sleep(5 * time.Second)
		SrpskainfoArticles <- model.ScrapedArticle{Title: model.Terminator}
		SrpskainfoApprovedSites <- model.Terminator
		SrpskainfoCandidates <- model.Terminator
	}

	crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
		// The href is forwarded exactly as it appears in the page.
		if href := e.Attr("href"); articleURLPattern.MatchString(href) {
			SrpskainfoCandidates <- href
		}
	})

	crHomePage.OnScraped(func(_ *colly.Response) {
		terminate()
	})

	crHomePage.OnError(func(_ *colly.Response, _ error) {
		terminate()
	})

	go crHomePage.Visit("https://srpskainfo.com")
}
// setupSiArticlePageCrawler registers the article-page callbacks on
// crArticlePage.
//
// For every page the collector fetches, a model.ScrapedArticle is built and
// sent on SrpskainfoArticles. Its title is the text of the first <h1>, its
// content is the concatenated text of the article body elements with
// surrounding whitespace trimmed, and its slug is derived from the title.
// Request errors are only reported on standard output.
func setupSiArticlePageCrawler(crArticlePage *colly.Collector) {
	const bodySelector = "p.article__top-content, p.article__content, h4.article__content, h3.article__content, h2.article__content, div.article__content"

	crArticlePage.OnHTML("html", func(page *colly.HTMLElement) {
		// Only the first <h1> is used; returning false stops the iteration.
		var headline string
		page.ForEachWithBreak("h1", func(_ int, heading *colly.HTMLElement) bool {
			headline = heading.Text
			return false
		})

		// Collect the text of every body fragment in document order.
		var body strings.Builder
		page.ForEach(bodySelector, func(_ int, fragment *colly.HTMLElement) {
			body.WriteString(extractJustText(fragment.DOM))
		})

		SrpskainfoArticles <- model.ScrapedArticle{
			OriginalUrl: page.Request.URL.String(),
			Title:       headline,
			Content:     strings.TrimSpace(body.String()),
			SourceId:    model.SrpskainfoSource,
			Slug:        slug.Make(headline),
		}
	})

	crArticlePage.OnError(func(_ *colly.Response, _ error) {
		fmt.Println("Problem crawling!")
	})
}

View File

@@ -3,4 +3,5 @@
sudo systemctl stop starenovine sudo systemctl stop starenovine
sudo cp ./server /opt/starenovine/server sudo cp ./server /opt/starenovine/server
sudo cp -R ./web /opt/starenovine/ sudo cp -R ./web /opt/starenovine/
sudo cp ./spider /opt/starenovine/spider
sudo systemctl start starenovine sudo systemctl start starenovine

BIN
server

Binary file not shown.

BIN
spider

Binary file not shown.