diff --git a/cmd/spider/spider.go b/cmd/spider/spider.go index bbec093..c133362 100644 --- a/cmd/spider/spider.go +++ b/cmd/spider/spider.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/lib/pq" "gitlab.com/kbr4/svevijesti/internal/database" + "gitlab.com/kbr4/svevijesti/internal/model" "gitlab.com/kbr4/svevijesti/internal/scraper" ) @@ -15,17 +16,37 @@ func main() { defer store.Close() go candidateChecker() + go scraper.CrawlSrpskainfo() go scraper.CrawlKlix() - for article := range scraper.KlixArticles { - fmt.Println("Saving ", article.OriginalUrl) - err = database.InsertArticle(store, article) - if err, ok := err.(*pq.Error); ok { - if err.Code.Name() != "unique_violation" { - panic(err) - } else { - fmt.Println("Skipping: ", article.OriginalUrl) + article := model.ScrapedArticle{} + + for { + select { + case article = <-scraper.KlixArticles: + if article.Title == model.Terminator { + scraper.KlixArticles = nil } + case article = <-scraper.SrpskainfoArticles: + if article.Title == model.Terminator { + scraper.SrpskainfoArticles = nil + } + } + + if article.Title != model.Terminator { + fmt.Println("Saving ", article.OriginalUrl) + err = database.InsertArticle(store, article) + if err, ok := err.(*pq.Error); ok { + if err.Code.Name() != "unique_violation" { + panic(err) + } else { + fmt.Println("Skipping: ", article.OriginalUrl) + } + } + } + + if scraper.KlixArticles == nil && scraper.SrpskainfoArticles == nil { + break } } } @@ -37,10 +58,31 @@ func candidateChecker() { } defer store.Close() - for url := range scraper.KlixCandidates { - if !database.IsSaved(store, url) { - scraper.KlixApprovedSites <- url + for { + select { + case url := <-scraper.KlixCandidates: + if url == model.Terminator { + scraper.KlixCandidates = nil + } else { + if !database.IsSaved(store, url) { + scraper.KlixApprovedSites <- url + } + } + + case url := <-scraper.SrpskainfoCandidates: + if url == model.Terminator { + scraper.SrpskainfoCandidates = nil + } else { + if !database.IsSaved(store, url) { + scraper.SrpskainfoApprovedSites <- url + } + } } + + if scraper.KlixCandidates == nil && scraper.SrpskainfoCandidates == nil { + break + } + } } diff --git a/internal/model/model.go b/internal/model/model.go index d9211fb..f51525b 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -26,13 +26,20 @@ type DisplayArticle struct { } const ( - KlixSource = 1 + KlixSource = 1 + SrpskainfoSource = 2 ) func SourceName(sourceId int) string { switch sourceId { case KlixSource: return "klix" + case SrpskainfoSource: + return "srpskainfo" } return "starenovine" } + +const ( + Terminator = "TERMINATED" +) diff --git a/internal/scraper/klix.go b/internal/scraper/klix.go index 5674de7..636996f 100644 --- a/internal/scraper/klix.go +++ b/internal/scraper/klix.go @@ -19,8 +19,8 @@ func CrawlKlix() { crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba")) crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba")) - setupArticlePageCrawler(crArticlePage) - setupHomepageCrawler(crHomePage, crArticlePage) + setupKlArticlePageCrawler(crArticlePage) + setupKlHomepageCrawler(crHomePage, crArticlePage) go visitApprovedPages(crArticlePage) } @@ -33,7 +33,7 @@ func visitApprovedPages(crArticlePage *colly.Collector) { } } -func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { +func setupKlHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { articleUrlR, _ := regexp.Compile("\\d\\d+$") crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) { @@ -46,21 +46,26 @@ func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Coll crHomePage.OnScraped(func(_ *colly.Response) { time.Sleep(5 * time.Second) - close(KlixArticles) - close(KlixApprovedSites) - close(KlixCandidates) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + KlixArticles <- terminating + KlixApprovedSites <- model.Terminator + KlixCandidates <- model.Terminator }) crHomePage.OnError(func(_ *colly.Response, _ error) { - close(KlixArticles) - close(KlixApprovedSites) - close(KlixCandidates) + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + KlixArticles <- terminating + KlixApprovedSites <- model.Terminator + KlixCandidates <- model.Terminator }) go crHomePage.Visit("https://www.klix.ba") } -func setupArticlePageCrawler(crArticlePage *colly.Collector) { +func setupKlArticlePageCrawler(crArticlePage *colly.Collector) { crArticlePage.OnHTML("html", func(e *colly.HTMLElement) { url := e.Request.URL.String() diff --git a/internal/scraper/srpskainfo.go b/internal/scraper/srpskainfo.go new file mode 100644 index 0000000..3af1430 --- /dev/null +++ b/internal/scraper/srpskainfo.go @@ -0,0 +1,101 @@ +package scraper + +import ( + "fmt" + "github.com/gocolly/colly" + "github.com/gosimple/slug" + "gitlab.com/kbr4/svevijesti/internal/model" + "regexp" + "strings" + "time" +) + +var SrpskainfoArticles = make(chan model.ScrapedArticle) +var SrpskainfoCandidates = make(chan string) +var SrpskainfoApprovedSites = make(chan string, 2) + +func CrawlSrpskainfo() { + + crHomePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com")) + crArticlePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com")) + + setupSiArticlePageCrawler(crArticlePage) + setupSiHomepageCrawler(crHomePage, crArticlePage) + + go visitSiApprovedPages(crArticlePage) +} + +func visitSiApprovedPages(crArticlePage *colly.Collector) { + fmt.Println("Consuming sites!") + for url := range SrpskainfoApprovedSites { + fmt.Println("Visiting: ", url) + crArticlePage.Visit(url) + } +} + +func setupSiHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { + + crHomePage.OnHTML("a", func(e *colly.HTMLElement) { + articleUrlR, _ := regexp.Compile("([A-Za-z0-9]+-){3,}([A-Za-z0-9]+)/$") + url := e.Attr("href") + completeUrl := url + if articleUrlR.MatchString(url) { + SrpskainfoCandidates <- completeUrl + } + }) + + crHomePage.OnScraped(func(_ *colly.Response) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + SrpskainfoArticles <- terminating + SrpskainfoApprovedSites <- model.Terminator + SrpskainfoCandidates <- model.Terminator + }) + + crHomePage.OnError(func(_ *colly.Response, _ error) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + SrpskainfoArticles <- terminating + SrpskainfoApprovedSites <- model.Terminator + SrpskainfoCandidates <- model.Terminator + }) + + go crHomePage.Visit("https://srpskainfo.com") +} + +func setupSiArticlePageCrawler(crArticlePage *colly.Collector) { + crArticlePage.OnHTML("html", func(e *colly.HTMLElement) { + + url := e.Request.URL.String() + + title := "" + e.ForEachWithBreak("h1", func(_ int, el *colly.HTMLElement) bool { + title = el.Text + return false + }) + + text := "" + + e.ForEach("p.article__top-content, p.article__content, h4.article__content, h3.article__content, h2.article__content, div.article__content", func(_ int, el *colly.HTMLElement) { + text += extractJustText(el.DOM) + }) + + article := model.ScrapedArticle{} + + trimmedText := strings.TrimSpace(text) + article.OriginalUrl = url + article.Title = title + article.Content = trimmedText + article.SourceId = model.SrpskainfoSource + article.Slug = slug.Make(title) + + SrpskainfoArticles <- article + }) + + crArticlePage.OnError(func(_ *colly.Response, _ error) { + fmt.Println("Problem crawling!") + }) + +} diff --git a/scripts/install_server.sh b/scripts/install_server.sh index ff5bc5e..71f1d3b 100644 --- a/scripts/install_server.sh +++ b/scripts/install_server.sh @@ -3,4 +3,5 @@ sudo systemctl stop starenovine sudo cp ./server /opt/starenovine/server sudo cp -R ./web /opt/starenovine/ +sudo cp ./spider /opt/starenovine/spider sudo systemctl start starenovine diff --git a/server b/server index 371387f..2b1774d 100755 Binary files a/server and b/server differ diff --git a/spider b/spider index 2e529f1..088c49f 100755 Binary files a/spider and b/spider differ