diff --git a/cmd/spider/spider.go b/cmd/spider/spider.go
index 15066f5..4824487 100644
--- a/cmd/spider/spider.go
+++ b/cmd/spider/spider.go
@@ -19,6 +19,7 @@ func main() {
 	go scraper.CrawlSrpskainfo()
 	go scraper.CrawlKlix()
 	go scraper.CrawlBljesak()
+	go scraper.CrawlAvaz()
 
 	article := model.ScrapedArticle{}
 
@@ -36,6 +37,10 @@ func main() {
 			if article.Title == model.Terminator {
 				scraper.BljesakArticles = nil
 			}
+		case article = <-scraper.AvazArticles:
+			if article.Title == model.Terminator {
+				scraper.AvazArticles = nil
+			}
 		}
 
 
@@ -55,6 +60,7 @@ func main() {
 
 		if scraper.KlixArticles == nil &&
 			scraper.SrpskainfoArticles == nil &&
+			scraper.AvazArticles == nil &&
 			scraper.BljesakCandidates == nil {
 			break
 		}
@@ -97,10 +103,20 @@ func candidateChecker() {
 				}
 			}
 
+		case url := <-scraper.AvazCandidates:
+			if url == model.Terminator {
+				scraper.AvazCandidates = nil
+			} else {
+				if !database.IsSaved(store, url) {
+					scraper.AvazApprovedSites <- url
+				}
+			}
+
 		}
 
 		if scraper.KlixCandidates == nil &&
 			scraper.SrpskainfoCandidates == nil &&
+			scraper.AvazCandidates == nil &&
 			scraper.BljesakCandidates == nil {
 			break
 		}
diff --git a/internal/model/model.go b/internal/model/model.go
index 1030bb8..c085bdf 100644
--- a/internal/model/model.go
+++ b/internal/model/model.go
@@ -29,6 +29,7 @@ const (
 	KlixSource       = 1
 	SrpskainfoSource = 2
 	BljesakSource    = 3
+	AvazSource       = 4
 )
 
 func SourceName(sourceId int) string {
@@ -39,6 +40,8 @@ func SourceName(sourceId int) string {
 		return "srpskainfo"
 	case BljesakSource:
 		return "bljesak"
+	case AvazSource:
+		return "avaz"
 	}
 	return "starenovine"
 }
diff --git a/internal/scraper/avaz.go b/internal/scraper/avaz.go
new file mode 100644
index 0000000..51f19f7
--- /dev/null
+++ b/internal/scraper/avaz.go
@@ -0,0 +1,134 @@
+package scraper
+
+import (
+	"fmt"
+	"math/rand"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/gocolly/colly"
+	"github.com/gosimple/slug"
+	"gitlab.com/kbr4/svevijesti/internal/model"
+)
+
+// Channels that connect the avaz.ba crawler to the spider's main loop and
+// candidate checker.
+var (
+	// AvazArticles carries scraped articles. An article whose Title is
+	// model.Terminator marks the end of the crawl.
+	AvazArticles = make(chan model.ScrapedArticle)
+	// AvazCandidates carries article URLs found on the home page, followed by
+	// model.Terminator once the home page has been processed.
+	AvazCandidates = make(chan string)
+	// AvazApprovedSites carries the URLs that should actually be scraped.
+	AvazApprovedSites = make(chan string, 2)
+)
+
+// Compiled once at package scope: an invalid pattern panics at start-up
+// instead of leaving a nil *regexp.Regexp behind.
+var (
+	avazArticleURL       = regexp.MustCompile(`/\d\d+/([a-z0-9-]+)`)
+	avazArticleBlacklist = regexp.MustCompile(`(info-vodic|foto-data)`)
+)
+
+// CrawlAvaz starts crawling avaz.ba. The home page is fetched in a
+// background goroutine and its article links are published on
+// AvazCandidates; URLs received on AvazApprovedSites are scraped and the
+// results are published on AvazArticles.
+func CrawlAvaz() {
+	crHomePage := colly.NewCollector(colly.AllowedDomains("avaz.ba"))
+	crArticlePage := colly.NewCollector(colly.AllowedDomains("avaz.ba"))
+
+	setupAvazArticlePageCrawler(crArticlePage)
+	setupAvazHomepageCrawler(crHomePage, crArticlePage)
+
+	go visitAvazApprovedPages(crArticlePage)
+}
+
+// visitAvazApprovedPages scrapes every URL received on AvazApprovedSites.
+func visitAvazApprovedPages(crArticlePage *colly.Collector) {
+	fmt.Println("Consuming sites!")
+	for url := range AvazApprovedSites {
+		if url == model.Terminator {
+			// The end-of-crawl marker is not a page; never try to fetch it.
+			continue
+		}
+		fmt.Println("Visiting: ", url)
+		if err := crArticlePage.Visit(url); err != nil {
+			fmt.Println("Skipping: ", url, err)
+		}
+	}
+}
+
+// setupAvazHomepageCrawler registers the home page callbacks on crHomePage
+// and starts the home page visit in a background goroutine. Links that look
+// like articles are published on AvazCandidates as absolute URLs.
+// crArticlePage is unused here and kept only for the existing call site.
+func setupAvazHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
+	crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
+		href := e.Attr("href")
+		if !avazArticleURL.MatchString(href) || avazArticleBlacklist.MatchString(href) {
+			return
+		}
+		// A relative href cannot be visited; resolve it against the page URL.
+		if completeURL := e.Request.AbsoluteURL(href); completeURL != "" {
+			AvazCandidates <- completeURL
+		}
+	})
+
+	// The crawl is wound down the same way whether the home page was scraped
+	// or failed to load.
+	crHomePage.OnScraped(func(_ *colly.Response) {
+		signalAvazFinished()
+	})
+	crHomePage.OnError(func(_ *colly.Response, err error) {
+		fmt.Println("Problem crawling!", err)
+		signalAvazFinished()
+	})
+
+	go crHomePage.Visit("https://avaz.ba")
+}
+
+// signalAvazFinished waits five seconds and then sends the terminator on the
+// three Avaz channels in the order articles, approved sites, candidates.
+func signalAvazFinished() {
+	time.Sleep(5 * time.Second)
+	AvazArticles <- model.ScrapedArticle{Title: model.Terminator}
+	AvazApprovedSites <- model.Terminator
+	AvazCandidates <- model.Terminator
+}
+
+// setupAvazArticlePageCrawler registers the article page callbacks on
+// crArticlePage. Every scraped page is turned into a model.ScrapedArticle
+// and published on AvazArticles.
+func setupAvazArticlePageCrawler(crArticlePage *colly.Collector) {
+	crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
+		// Only the first matching heading is used as the title.
+		title := ""
+		e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool {
+			title = el.Text
+			return false
+		})
+
+		var text strings.Builder
+		e.ForEach("p.podtitle, div.artikal-text", func(_ int, el *colly.HTMLElement) {
+			text.WriteString(extractJustText(el.DOM))
+		})
+
+		article := model.ScrapedArticle{
+			OriginalUrl: e.Request.URL.String(),
+			Title:       title,
+			Content:     strings.TrimSpace(text.String()),
+			SourceId:    model.AvazSource,
+		}
+		slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
+		article.Slug = slug.Make(slugBase)
+
+		AvazArticles <- article
+	})
+
+	crArticlePage.OnError(func(_ *colly.Response, err error) {
+		fmt.Println("Problem crawling!", err)
+	})
+}
diff --git a/scripts/install_server.sh b/scripts/install_server.sh
index 71f1d3b..33807c2 100644
--- a/scripts/install_server.sh
+++ b/scripts/install_server.sh
@@ -3,5 +3,6 @@ sudo systemctl stop starenovine
 
 sudo cp ./server /opt/starenovine/server
 sudo cp -R ./web /opt/starenovine/
+sudo killall spider
 sudo cp ./spider /opt/starenovine/spider
 sudo systemctl start starenovine
diff --git a/server b/server
index fcd3ad9..df4e93d 100755
Binary files a/server and b/server differ
diff --git a/spider b/spider
index 367849d..8b778c0 100755
Binary files a/spider and b/spider differ