package scraper

import (
	"fmt"
	"math/rand"
	"regexp"
	"strings"
	"time"

	"github.com/gocolly/colly"
	"github.com/gosimple/slug"
	"gitlab.com/kbr4/svevijesti/internal/model"
)

// Channels connecting the Srpskainfo crawler to the rest of the pipeline.
var (
	// SrpskainfoArticles receives one model.ScrapedArticle per crawled
	// article page. When the home page crawl finishes (or fails), an article
	// whose Title is model.Terminator is sent. The channel is unbuffered, so
	// every send blocks until a receiver takes the value.
	SrpskainfoArticles = make(chan model.ScrapedArticle)

	// SrpskainfoCandidates receives every home page href that matches
	// siArticleURLRe, followed by model.Terminator once the home page crawl
	// finishes (or fails). The channel is unbuffered.
	SrpskainfoCandidates = make(chan string)

	// SrpskainfoApprovedSites is the queue of URLs that visitSiApprovedPages
	// fetches with the article collector.
	// NOTE(review): presumably filled by a consumer of SrpskainfoCandidates
	// outside this file — confirm. This file itself only sends
	// model.Terminator on it.
	SrpskainfoApprovedSites = make(chan string, 2)
)

// siArticleURLRe matches hrefs that end in at least three hyphen-terminated
// alphanumeric groups, one more alphanumeric group and a trailing slash
// (for example ".../a-b-c-d/"). It is compiled once at package scope instead
// of once per <a> element.
var siArticleURLRe = regexp.MustCompile("([A-Za-z0-9]+-){3,}([A-Za-z0-9]+)/$")

// CrawlSrpskainfo wires up a home page collector and an article page
// collector, both restricted to srpskainfo.com, and starts the crawl.
//
// The home page visit and the approved-URL consumer are both started in their
// own goroutines, so this function does not wait for either of them. Results
// are delivered through the package-level Srpskainfo* channels.
func CrawlSrpskainfo() {
	crHomePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com"))
	crArticlePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com"))

	setupSiArticlePageCrawler(crArticlePage)
	setupSiHomepageCrawler(crHomePage, crArticlePage)

	go visitSiApprovedPages(crArticlePage)
}

// visitSiApprovedPages visits every URL received on SrpskainfoApprovedSites
// with crArticlePage. It loops until the channel is closed; nothing in this
// file closes it.
func visitSiApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for pageURL := range SrpskainfoApprovedSites {
		// The home page callbacks send model.Terminator on this channel as an
		// end-of-crawl marker. It is not a URL, so do not try to fetch it.
		// NOTE(review): the loop keeps running after the marker, as before;
		// confirm whether it should stop instead.
		if pageURL == model.Terminator {
			continue
		}

		fmt.Println("Visiting: ", pageURL)
		// Visit can fail without the collector's OnError callback firing
		// (for example when the URL is rejected before any request is made),
		// so report the returned error instead of dropping it.
		if err := crArticlePage.Visit(pageURL); err != nil {
			fmt.Println("Visit failed: ", pageURL, err)
		}
	}
}

// setupSiHomepageCrawler registers the home page callbacks on crHomePage and
// starts visiting https://srpskainfo.com in a new goroutine.
//
// Every <a> element whose href matches siArticleURLRe is sent, unmodified, on
// SrpskainfoCandidates. When the page has been scraped, or the request fails,
// the end-of-crawl markers are sent via signalSiCrawlFinished.
//
// crArticlePage is accepted for signature compatibility and is not used here.
func setupSiHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if siArticleURLRe.MatchString(href) {
			SrpskainfoCandidates <- href
		}
	})

	crHomePage.OnScraped(func(_ *colly.Response) {
		signalSiCrawlFinished()
	})

	crHomePage.OnError(func(_ *colly.Response, _ error) {
		signalSiCrawlFinished()
	})

	go func() {
		if err := crHomePage.Visit("https://srpskainfo.com"); err != nil {
			fmt.Println("Visit failed: ", "https://srpskainfo.com", err)
		}
	}()
}

// signalSiCrawlFinished waits five seconds and then sends model.Terminator on
// all three Srpskainfo channels, in the order articles, approved sites,
// candidates. Each send blocks until the channel can accept the value.
//
// NOTE(review): the five second pause presumably gives in-flight article
// scraping time to complete before the markers are sent — confirm.
func signalSiCrawlFinished() {
	time.Sleep(5 * time.Second)

	terminating := model.ScrapedArticle{Title: model.Terminator}
	SrpskainfoArticles <- terminating
	SrpskainfoApprovedSites <- model.Terminator
	SrpskainfoCandidates <- model.Terminator
}

// setupSiArticlePageCrawler registers the article page callbacks on
// crArticlePage.
//
// For every fetched page it takes the text of the first <h1> as the title,
// concatenates the text extracted (via extractJustText) from the
// div.article__top-content and div.article__content elements, and sends the
// resulting model.ScrapedArticle on SrpskainfoArticles. Request errors are
// only reported on stdout.
func setupSiArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
		// Only the first <h1> is used: returning false stops the iteration.
		title := ""
		e.ForEachWithBreak("h1", func(_ int, el *colly.HTMLElement) bool {
			title = el.Text
			return false
		})

		var body strings.Builder
		e.ForEach("div.article__top-content, div.article__content", func(_ int, el *colly.HTMLElement) {
			body.WriteString(extractJustText(el.DOM))
		})

		article := model.ScrapedArticle{}
		article.OriginalUrl = e.Request.URL.String()
		article.Title = title
		article.Content = strings.TrimSpace(body.String())
		article.SourceId = model.SrpskainfoSource

		// The slug is built from the source id, a random number in [0, 1000)
		// and the title.
		slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
		article.Slug = slug.Make(slugBase)

		SrpskainfoArticles <- article
	})

	crArticlePage.OnError(func(_ *colly.Response, _ error) {
		fmt.Println("Problem crawling!")
	})
}