104 lines
2.5 KiB
Go
104 lines
2.5 KiB
Go
package scraper
|
|
|
|
import (
|
|
"fmt"
|
|
"github.com/gocolly/colly"
|
|
"github.com/gosimple/slug"
|
|
"gitlab.com/kbr4/svevijesti/internal/model"
|
|
"math/rand"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
var KlixArticles = make(chan model.ScrapedArticle)
|
|
var KlixCandidates = make(chan string)
|
|
var KlixApprovedSites = make(chan string, 2)
|
|
|
|
func CrawlKlix() {
|
|
|
|
crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
|
crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
|
|
|
setupKlArticlePageCrawler(crArticlePage)
|
|
setupKlHomepageCrawler(crHomePage, crArticlePage)
|
|
|
|
go visitApprovedPages(crArticlePage)
|
|
}
|
|
|
|
func visitApprovedPages(crArticlePage *colly.Collector) {
|
|
fmt.Println("Consuming sites!")
|
|
for url := range KlixApprovedSites {
|
|
fmt.Println("Visiting: ", url)
|
|
crArticlePage.Visit(url)
|
|
}
|
|
}
|
|
|
|
func setupKlHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
|
|
|
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
|
crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
|
|
url := e.Attr("href")
|
|
completeUrl := "https://www.klix.ba" + url
|
|
if articleUrlR.MatchString(url) {
|
|
KlixCandidates <- completeUrl
|
|
}
|
|
})
|
|
|
|
crHomePage.OnScraped(func(_ *colly.Response) {
|
|
time.Sleep(5 * time.Second)
|
|
terminating := model.ScrapedArticle{}
|
|
terminating.Title = model.Terminator
|
|
KlixArticles <- terminating
|
|
KlixApprovedSites <- model.Terminator
|
|
KlixCandidates <- model.Terminator
|
|
})
|
|
|
|
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
|
time.Sleep(5 * time.Second)
|
|
terminating := model.ScrapedArticle{}
|
|
terminating.Title = model.Terminator
|
|
KlixArticles <- terminating
|
|
KlixApprovedSites <- model.Terminator
|
|
KlixCandidates <- model.Terminator
|
|
})
|
|
|
|
go crHomePage.Visit("https://www.klix.ba")
|
|
}
|
|
|
|
func setupKlArticlePageCrawler(crArticlePage *colly.Collector) {
|
|
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
|
|
|
url := e.Request.URL.String()
|
|
|
|
title := ""
|
|
e.ForEachWithBreak("title", func(_ int, el *colly.HTMLElement) bool {
|
|
title = el.Text
|
|
return false
|
|
})
|
|
|
|
text := ""
|
|
|
|
e.ForEach("div#text, p.lead", func(_ int, el *colly.HTMLElement) {
|
|
text += extractJustText(el.DOM)
|
|
})
|
|
|
|
article := model.ScrapedArticle{}
|
|
|
|
trimmedText := strings.TrimSpace(text)
|
|
article.OriginalUrl = url
|
|
article.Title = title
|
|
article.Content = trimmedText
|
|
article.SourceId = model.KlixSource
|
|
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
|
article.Slug = slug.Make(slugBase)
|
|
|
|
KlixArticles <- article
|
|
})
|
|
|
|
crArticlePage.OnError(func(_ *colly.Response, _ error) {
|
|
fmt.Println("Problem crawling!")
|
|
})
|
|
|
|
}
|