package scraper

import (
	"fmt"
	"math/rand"
	"regexp"
	"strings"
	"time"

	"github.com/gocolly/colly"
	"github.com/gosimple/slug"
	"gitlab.com/kbr4/svevijesti/internal/model"
)

const (
	// klixBaseURL is the site root; homepage hrefs are appended to it.
	klixBaseURL = "https://www.klix.ba"

	// klixDrainDelay is how long the homepage callbacks wait before
	// emitting the terminator values on the Klix channels.
	klixDrainDelay = 5 * time.Second
)

var (
	// KlixArticles carries every article scraped from an article page. A
	// ScrapedArticle whose Title is model.Terminator is sent once the
	// homepage crawl has finished or failed.
	KlixArticles = make(chan model.ScrapedArticle)

	// KlixCandidates carries the complete URLs of article links found on the
	// homepage, followed by model.Terminator once the homepage crawl has
	// finished or failed.
	KlixCandidates = make(chan string)

	// KlixApprovedSites carries the URLs that the article crawler should
	// visit, followed by model.Terminator once the homepage crawl has
	// finished or failed.
	// NOTE(review): the producer of approved URLs is not in this file.
	KlixApprovedSites = make(chan string, 2)
)

// klixArticleHrefPattern matches hrefs that end in two or more digits, which
// is how article links are told apart from other homepage links.
var klixArticleHrefPattern = regexp.MustCompile(`\d\d+$`)

// CrawlKlix starts an asynchronous crawl of www.klix.ba. It wires up one
// collector for the homepage and one for article pages, launches the homepage
// visit in the background, and starts a goroutine that visits every URL
// received on KlixApprovedSites. It returns immediately; results arrive on
// the Klix* channels.
func CrawlKlix() {
	crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
	crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))

	setupKlArticlePageCrawler(crArticlePage)
	setupKlHomepageCrawler(crHomePage, crArticlePage)

	go visitApprovedPages(crArticlePage)
}

// visitApprovedPages visits every URL received on KlixApprovedSites with the
// article collector. It returns when it receives model.Terminator (or when
// the channel is closed), so the goroutine started by CrawlKlix ends with the
// crawl instead of leaking and trying to fetch the sentinel as a URL.
func visitApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for link := range KlixApprovedSites {
		if link == model.Terminator {
			return
		}
		fmt.Println("Visiting: ", link)
		if err := crArticlePage.Visit(link); err != nil {
			fmt.Println("Problem visiting: ", link, err)
		}
	}
}

// sendKlixTerminators waits klixDrainDelay and then sends the terminator
// value on KlixArticles, KlixApprovedSites and KlixCandidates, in that order.
// Each send blocks until the channel accepts the value.
func sendKlixTerminators() {
	time.Sleep(klixDrainDelay)

	terminating := model.ScrapedArticle{}
	terminating.Title = model.Terminator

	KlixArticles <- terminating
	KlixApprovedSites <- model.Terminator
	KlixCandidates <- model.Terminator
}

// setupKlHomepageCrawler registers the homepage callbacks on crHomePage and
// starts the homepage visit in a new goroutine.
//
// Every ".container a" link whose href ends in two or more digits is sent on
// KlixCandidates as a complete URL. When the homepage has been scraped, or
// the request fails, the terminator values are sent on all Klix channels.
//
// crArticlePage is accepted for signature compatibility; it is not used here.
func setupKlHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if !klixArticleHrefPattern.MatchString(href) {
			return
		}
		// NOTE(review): this assumes hrefs are site-relative paths; an
		// absolute href would yield a malformed URL. Confirm against the
		// live markup before switching to e.Request.AbsoluteURL.
		KlixCandidates <- klixBaseURL + href
	})

	crHomePage.OnScraped(func(_ *colly.Response) {
		sendKlixTerminators()
	})

	crHomePage.OnError(func(_ *colly.Response, _ error) {
		sendKlixTerminators()
	})

	go func() {
		if err := crHomePage.Visit(klixBaseURL); err != nil {
			fmt.Println("Problem visiting: ", klixBaseURL, err)
		}
	}()
}

// setupKlArticlePageCrawler registers the article-page callbacks on
// crArticlePage.
//
// For each fetched page it takes the text of the first <title> element,
// concatenates the extracted text of every "div#text" and "p.lead" element,
// and sends the resulting model.ScrapedArticle on KlixArticles. The slug is
// built from the source id, a random number below 1000 and the title.
// Request errors are only reported on stdout.
func setupKlArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
		pageURL := e.Request.URL.String()

		// Only the first <title> is wanted; returning false stops iteration.
		title := ""
		e.ForEachWithBreak("title", func(_ int, el *colly.HTMLElement) bool {
			title = el.Text
			return false
		})

		var body strings.Builder
		e.ForEach("div#text, p.lead", func(_ int, el *colly.HTMLElement) {
			body.WriteString(extractJustText(el.DOM))
		})

		article := model.ScrapedArticle{}
		article.OriginalUrl = pageURL
		article.Title = title
		article.Content = strings.TrimSpace(body.String())
		article.SourceId = model.KlixSource

		slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
		article.Slug = slug.Make(slugBase)

		KlixArticles <- article
	})

	crArticlePage.OnError(func(_ *colly.Response, _ error) {
		fmt.Println("Problem crawling!")
	})
}