package scraper

import (
	"fmt"
	"math/rand"
	"regexp"
	"strings"
	"time"

	"github.com/gocolly/colly"
	"github.com/gosimple/slug"
	"gitlab.com/kbr4/svevijesti/internal/model"
)

var (
	// BljesakArticles receives every article scraped from an article page.
	// A final article whose Title is model.Terminator is sent once the
	// homepage crawl has finished or failed.
	BljesakArticles = make(chan model.ScrapedArticle)

	// BljesakCandidates receives the href of every homepage link that looks
	// like an article, followed by model.Terminator once the homepage crawl
	// has finished or failed.
	BljesakCandidates = make(chan string)

	// BljesakApprovedSites carries the URLs that the article crawler should
	// visit. It is consumed by visitBljesakApprovedPages; model.Terminator
	// ends that consumer.
	// NOTE(review): the producer of approved URLs is not in this file —
	// presumably a caller filters BljesakCandidates into this channel.
	BljesakApprovedSites = make(chan string, 2)
)

var (
	// bljesakArticleURLPattern matches links that end in at least two digits.
	bljesakArticleURLPattern = regexp.MustCompile(`\d\d+$`)

	// bljesakArticleBlacklist matches links that must never be treated as
	// article candidates, even when they end in digits.
	bljesakArticleBlacklist = regexp.MustCompile(`(info-vodic|foto-data)`)
)

// CrawlBljesak starts one crawl of bljesak.info and returns immediately.
//
// Two collectors are created: one for the homepage, which publishes article
// candidates on BljesakCandidates, and one for article pages, which publishes
// results on BljesakArticles. The homepage visit and the consumer of
// BljesakApprovedSites both run in their own goroutines.
func CrawlBljesak() {
	crHomePage := colly.NewCollector(colly.AllowedDomains("bljesak.info"))
	crArticlePage := colly.NewCollector(colly.AllowedDomains("bljesak.info"))

	setupBljesakArticlePageCrawler(crArticlePage)
	setupBljesakHomepageCrawler(crHomePage, crArticlePage)

	go visitBljesakApprovedPages(crArticlePage)
}

// visitBljesakApprovedPages visits every URL received on BljesakApprovedSites
// with crArticlePage. It returns when it receives model.Terminator or when
// the channel is closed, so that each crawl does not leave a consumer
// goroutine behind.
func visitBljesakApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for pageURL := range BljesakApprovedSites {
		// The terminator is a sentinel, not a URL: stop instead of trying
		// to visit it.
		if pageURL == model.Terminator {
			return
		}

		fmt.Println("Visiting: ", pageURL)
		if err := crArticlePage.Visit(pageURL); err != nil {
			fmt.Println("Problem visiting!", pageURL, err)
		}
	}
}

// setupBljesakHomepageCrawler registers the homepage callbacks on crHomePage
// and starts visiting https://bljesak.info in a new goroutine.
//
// Every anchor whose href matches bljesakArticleURLPattern and does not match
// bljesakArticleBlacklist is sent to BljesakCandidates. When the homepage has
// been scraped, or the request failed, terminators are sent on all three
// package channels. crArticlePage is accepted for signature compatibility
// and is not used here.
func setupBljesakHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if !bljesakArticleURLPattern.MatchString(href) || bljesakArticleBlacklist.MatchString(href) {
			return
		}
		// NOTE(review): the href is forwarded exactly as found in the page;
		// if the site uses relative links the consumer has to resolve them
		// (e.g. via e.Request.AbsoluteURL) — confirm against the consumer.
		BljesakCandidates <- href
	})

	crHomePage.OnScraped(func(_ *colly.Response) {
		sendBljesakTerminators()
	})

	crHomePage.OnError(func(_ *colly.Response, _ error) {
		sendBljesakTerminators()
	})

	go func() {
		if err := crHomePage.Visit("https://bljesak.info"); err != nil {
			fmt.Println("Problem crawling!", err)
		}
	}()
}

// sendBljesakTerminators waits five seconds and then sends model.Terminator
// on BljesakArticles (as the Title of an otherwise empty article),
// BljesakApprovedSites and BljesakCandidates, in that order.
// NOTE(review): the delay presumably gives in-flight article visits time to
// finish before consumers see the terminator — confirm.
func sendBljesakTerminators() {
	time.Sleep(5 * time.Second)

	terminating := model.ScrapedArticle{}
	terminating.Title = model.Terminator

	BljesakArticles <- terminating
	BljesakApprovedSites <- model.Terminator
	BljesakCandidates <- model.Terminator
}

// setupBljesakArticlePageCrawler registers the article page callbacks on
// crArticlePage.
//
// For every visited page the title is taken from the first "h1.title" or
// "h3.title" element and the content from all "div.intro" and
// "div#infiniteLoadBreakpoint" elements. The resulting article is sent to
// BljesakArticles. Crawl errors are only logged.
func setupBljesakArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
		title := ""
		e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool {
			title = el.Text
			return false // only the first matching heading is used
		})

		var content strings.Builder
		e.ForEach("div.intro, div#infiniteLoadBreakpoint", func(_ int, el *colly.HTMLElement) {
			content.WriteString(extractJustText(el.DOM))
		})

		article := model.ScrapedArticle{}
		article.OriginalUrl = e.Request.URL.String()
		article.Title = title
		article.Content = strings.TrimSpace(content.String())
		article.SourceId = model.BljesakSource

		// A random number is mixed into the slug alongside the source id
		// and the title.
		slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
		article.Slug = slug.Make(slugBase)

		BljesakArticles <- article
	})

	crArticlePage.OnError(func(_ *colly.Response, err error) {
		fmt.Println("Problem crawling!", err)
	})
}