package scraper import ( "fmt" "github.com/gocolly/colly" "github.com/gosimple/slug" "gitlab.com/kbr4/svevijesti/internal/model" "math/rand" "regexp" "strings" "time" ) var AvazArticles = make(chan model.ScrapedArticle) var AvazCandidates = make(chan string) var AvazApprovedSites = make(chan string, 2) func CrawlAvaz() { crHomePage := colly.NewCollector(colly.AllowedDomains("avaz.ba")) crArticlePage := colly.NewCollector(colly.AllowedDomains("avaz.ba")) setupAvazArticlePageCrawler(crArticlePage) setupAvazHomepageCrawler(crHomePage, crArticlePage) go visitAvazApprovedPages(crArticlePage) } func visitAvazApprovedPages(crArticlePage *colly.Collector) { fmt.Println("Consuming sites!") for url := range AvazApprovedSites { fmt.Println("Visiting: ", url) crArticlePage.Visit(url) } } func setupAvazHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { articleUrlR, _ := regexp.Compile("/\\d\\d+/([a-z0-9-]+)") articleBlacklist, _ := regexp.Compile("(english)") crHomePage.OnHTML("a", func(e *colly.HTMLElement) { url := e.Attr("href") completeUrl := url if articleUrlR.MatchString(url) && !articleBlacklist.MatchString(url) { AvazCandidates <- completeUrl } }) crHomePage.OnScraped(func(_ *colly.Response) { time.Sleep(5 * time.Second) terminating := model.ScrapedArticle{} terminating.Title = model.Terminator AvazArticles <- terminating AvazApprovedSites <- model.Terminator AvazCandidates <- model.Terminator }) crHomePage.OnError(func(_ *colly.Response, _ error) { time.Sleep(5 * time.Second) terminating := model.ScrapedArticle{} terminating.Title = model.Terminator AvazArticles <- terminating AvazApprovedSites <- model.Terminator AvazCandidates <- model.Terminator }) go crHomePage.Visit("https://avaz.ba") } func setupAvazArticlePageCrawler(crArticlePage *colly.Collector) { crArticlePage.OnHTML("html", func(e *colly.HTMLElement) { url := e.Request.URL.String() title := "" e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool { title = el.Text return false }) text := "" e.ForEach("p.podtitle, div.artikal-text", func(_ int, el *colly.HTMLElement) { text += extractJustText(el.DOM) }) article := model.ScrapedArticle{} trimmedText := strings.TrimSpace(text) article.OriginalUrl = url article.Title = title article.Content = trimmedText article.SourceId = model.AvazSource slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title) article.Slug = slug.Make(slugBase) AvazArticles <- article }) crArticlePage.OnError(func(_ *colly.Response, err error) { fmt.Println("Problem crawling!", err) }) }