Files
old-svevijesti/internal/scraper/srpskainfo.go
2022-02-15 18:00:30 +01:00

104 lines
2.7 KiB
Go

package scraper
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gosimple/slug"
"gitlab.com/kbr4/svevijesti/internal/model"
"math/rand"
"regexp"
"strings"
"time"
)
// Channels used by the srpskainfo.com crawler to hand work to the rest of
// the program.
var (
	// SrpskainfoArticles carries every article built by the article page
	// crawler, plus a marker article whose Title is model.Terminator once
	// the homepage crawl has finished or failed. Unbuffered.
	SrpskainfoArticles = make(chan model.ScrapedArticle)

	// SrpskainfoCandidates carries hrefs found on the homepage that match
	// the article URL pattern, followed by model.Terminator. Unbuffered.
	SrpskainfoCandidates = make(chan string)

	// SrpskainfoApprovedSites carries the URLs that visitSiApprovedPages
	// hands to the article page crawler; it has room for two pending values.
	// NOTE(review): the sender of approved URLs is not visible in this part
	// of the file; presumably a consumer of SrpskainfoCandidates. Confirm.
	SrpskainfoApprovedSites = make(chan string, 2)
)
// CrawlSrpskainfo wires up and starts one crawl of srpskainfo.com.
//
// It builds two collectors restricted to the srpskainfo.com domain, one for
// the homepage and one for article pages, registers their callbacks, and
// starts a goroutine that visits every URL received on
// SrpskainfoApprovedSites. The function itself does not wait for the crawl.
func CrawlSrpskainfo() {
	const domain = "srpskainfo.com"

	homepage := colly.NewCollector(colly.AllowedDomains(domain))
	articles := colly.NewCollector(colly.AllowedDomains(domain))

	// Article callbacks are registered before the homepage setup, which
	// already kicks off the homepage visit in the background.
	setupSiArticlePageCrawler(articles)
	setupSiHomepageCrawler(homepage, articles)

	go visitSiApprovedPages(articles)
}
// visitSiApprovedPages visits, with crArticlePage, every URL received on
// SrpskainfoApprovedSites. It returns only when that channel is closed.
//
// A failed Visit is logged and the loop moves on to the next URL, so one bad
// URL does not stop the consumer.
//
// NOTE(review): the homepage callbacks also send model.Terminator on this
// channel. It is not treated specially here, so it is passed to Visit like
// any other value and does not end the loop. Confirm whether that is intended.
func visitSiApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for url := range SrpskainfoApprovedSites {
		fmt.Println("Visiting: ", url)
		if err := crArticlePage.Visit(url); err != nil {
			// Previously this error was dropped without a trace.
			fmt.Println("Problem visiting: ", url, err)
		}
	}
}
// setupSiHomepageCrawler registers the homepage callbacks on crHomePage and
// starts visiting https://srpskainfo.com in a new goroutine.
//
// Every anchor whose href matches the article URL pattern is sent, unchanged,
// to SrpskainfoCandidates. When the homepage scrape finishes or fails, the
// handler sleeps for five seconds and then sends the model.Terminator marker
// on SrpskainfoArticles, SrpskainfoApprovedSites and SrpskainfoCandidates,
// in that order.
//
// crArticlePage is not used by this function; the parameter is kept so that
// the signature stays the same for existing callers.
func setupSiHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	// Compiled once per setup rather than once per anchor element. The
	// pattern requires at least three "word-" groups followed by a final
	// word and a trailing slash at the end of the href.
	articleURLRe := regexp.MustCompile("([A-Za-z0-9]+-){3,}([A-Za-z0-9]+)/$")

	// signalDone is shared by the success and the error path, which both
	// emit exactly the same terminator sequence.
	signalDone := func() {
		time.Sleep(5 * time.Second)
		terminating := model.ScrapedArticle{}
		terminating.Title = model.Terminator
		SrpskainfoArticles <- terminating
		SrpskainfoApprovedSites <- model.Terminator
		SrpskainfoCandidates <- model.Terminator
	}

	crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if articleURLRe.MatchString(href) {
			SrpskainfoCandidates <- href
		}
	})

	crHomePage.OnScraped(func(_ *colly.Response) {
		signalDone()
	})

	crHomePage.OnError(func(_ *colly.Response, _ error) {
		signalDone()
	})

	go func() {
		// Report a failed visit instead of discarding the returned error.
		if err := crHomePage.Visit("https://srpskainfo.com"); err != nil {
			fmt.Println("Problem visiting srpskainfo homepage: ", err)
		}
	}()
}
// setupSiArticlePageCrawler registers the article page callbacks on
// crArticlePage.
//
// For every page the collector scrapes, the OnHTML callback builds a
// model.ScrapedArticle and sends it on SrpskainfoArticles:
//   - OriginalUrl is the URL of the request,
//   - Title is the text of the first h1 element (empty if there is none),
//   - Content is the concatenated, whitespace-trimmed text that
//     extractJustText returns for the article content elements,
//   - SourceId is model.SrpskainfoSource,
//   - Slug is derived from the source id, a random number below 1000 and
//     the title.
//
// Crawl errors are printed together with the failing URL, when available,
// and the error itself.
func setupSiArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
		url := e.Request.URL.String()

		// Returning false stops the iteration after the first h1.
		title := ""
		e.ForEachWithBreak("h1", func(_ int, el *colly.HTMLElement) bool {
			title = el.Text
			return false
		})

		// A Builder avoids re-copying the accumulated text for every
		// matched element.
		var text strings.Builder
		e.ForEach("p.article__top-content, p.article__content, h4.article__content, h3.article__content, h2.article__content, div.article__content", func(_ int, el *colly.HTMLElement) {
			text.WriteString(extractJustText(el.DOM))
		})

		article := model.ScrapedArticle{}
		article.OriginalUrl = url
		article.Title = title
		article.Content = strings.TrimSpace(text.String())
		article.SourceId = model.SrpskainfoSource

		// The random component is added to the slug base alongside the
		// source id and title; it does not guarantee uniqueness.
		slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
		article.Slug = slug.Make(slugBase)

		SrpskainfoArticles <- article
	})

	crArticlePage.OnError(func(r *colly.Response, err error) {
		// Keep the original message, but attach what failed and why so the
		// problem can actually be diagnosed.
		if r != nil && r.Request != nil && r.Request.URL != nil {
			fmt.Println("Problem crawling!", r.Request.URL.String(), err)
			return
		}
		fmt.Println("Problem crawling!", err)
	})
}