package scraper

import (
	"fmt"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly"
	"github.com/gosimple/slug"
	"gitlab.com/kbr4/svevijesti/internal/model"
)

// klixHomeURL is the entry point of the crawl; it is also used as the prefix
// for the relative links found on the home page.
const klixHomeURL = "https://www.klix.ba"

// klixArticleURLRe matches hrefs that end in at least two digits, which is
// how article links are recognised on the home page. It is compiled once at
// package scope instead of on every crawl.
var klixArticleURLRe = regexp.MustCompile(`\d\d+$`)

// KlixArticles receives every article scraped from an approved article page.
var KlixArticles = make(chan model.ScrapedArticle)

// KlixCandidates receives the absolute URL of every home page link that looks
// like an article.
var KlixCandidates = make(chan string)

// KlixApprovedSites is the queue of article URLs that should actually be
// visited. It is only consumed in this file; presumably an external stage
// filters KlixCandidates into it (TODO confirm against the caller).
var KlixApprovedSites = make(chan string, 2)

// CrawlKlix starts a crawl of the klix.ba home page and returns immediately.
//
// Candidate article URLs are published on KlixCandidates, URLs read from
// KlixApprovedSites are visited in a background goroutine, and the scraped
// articles are published on KlixArticles. All three channels are closed when
// the home page crawl finishes or fails.
func CrawlKlix() {
	crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
	crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))

	setupArticlePageCrawler(crArticlePage)
	setupHomepageCrawler(crHomePage, crArticlePage)

	go visitApprovedPages(crArticlePage)
}

// visitApprovedPages visits every URL received on KlixApprovedSites with the
// article collector and returns once that channel is closed and drained.
func visitApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for pageURL := range KlixApprovedSites {
		fmt.Println("Visiting: ", pageURL)
		// Visit can reject a URL before any request is made, in which case no
		// collector callback runs; report the error here so it is not lost.
		if err := crArticlePage.Visit(pageURL); err != nil {
			fmt.Printf("Visiting %s failed: %v\n", pageURL, err)
		}
	}
}

// setupHomepageCrawler registers the home page callbacks on crHomePage and
// starts visiting the home page in a new goroutine.
//
// Links under ".container" whose href matches klixArticleURLRe are sent to
// KlixCandidates as absolute URLs. When the crawl ends (successfully or not)
// KlixArticles, KlixApprovedSites and KlixCandidates are closed exactly once.
// crArticlePage is accepted for signature compatibility and is not used here.
func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	// More than one termination path can run for a single crawl (error
	// callback, scraped callback, failed Visit). Closing a channel twice
	// panics, so every path goes through the same Once. The Once is scoped to
	// this call so that each crawl closes the channels current at that time.
	var closeOnce sync.Once
	closeChannels := func() {
		closeOnce.Do(func() {
			close(KlixArticles)
			close(KlixApprovedSites)
			close(KlixCandidates)
		})
	}

	crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if !klixArticleURLRe.MatchString(href) {
			return
		}
		KlixCandidates <- klixHomeURL + href
	})

	crHomePage.OnScraped(func(_ *colly.Response) {
		// Grace period for the downstream stages before the channels close.
		// NOTE(review): an article page that finishes after this window will
		// send on a closed KlixArticles and panic; a proper fix needs the
		// stages to signal completion instead of relying on a fixed delay.
		time.Sleep(5 * time.Second)
		closeChannels()
	})

	crHomePage.OnError(func(_ *colly.Response, _ error) {
		closeChannels()
	})

	go func() {
		if err := crHomePage.Visit(klixHomeURL); err != nil {
			fmt.Println("Problem crawling home page:", err)
			// A visit that fails before any callback runs would otherwise
			// leave the channels open and their consumers blocked forever.
			closeChannels()
		}
	}()
}

// setupArticlePageCrawler registers the article page callbacks on
// crArticlePage.
//
// For every page it builds a model.ScrapedArticle from the first <title>
// element and the concatenated text of "div#text" and "p.lead", then sends it
// to KlixArticles. Crawl errors are reported on standard output.
func setupArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
		title := ""
		e.ForEachWithBreak("title", func(_ int, el *colly.HTMLElement) bool {
			title = el.Text
			return false // only the first <title> is wanted
		})

		var content strings.Builder
		e.ForEach("div#text, p.lead", func(_ int, el *colly.HTMLElement) {
			content.WriteString(extractJustText(el.DOM))
		})

		KlixArticles <- model.ScrapedArticle{
			OriginalUrl: e.Request.URL.String(),
			Title:       title,
			Content:     strings.TrimSpace(content.String()),
			SourceId:    model.KlixSource,
			Slug:        slug.Make(title),
		}
	})

	crArticlePage.OnError(func(_ *colly.Response, err error) {
		fmt.Println("Problem crawling!", err)
	})
}