diff --git a/cmd/spider/spider.go b/cmd/spider/spider.go index c133362..15066f5 100644 --- a/cmd/spider/spider.go +++ b/cmd/spider/spider.go @@ -18,6 +18,7 @@ func main() { go candidateChecker() go scraper.CrawlSrpskainfo() go scraper.CrawlKlix() + go scraper.CrawlBljesak() article := model.ScrapedArticle{} @@ -31,6 +32,11 @@ func main() { if article.Title == model.Terminator { scraper.SrpskainfoArticles = nil } + case article = <-scraper.BljesakArticles: + if article.Title == model.Terminator { + scraper.BljesakArticles = nil + } + } if article.Title != model.Terminator { @@ -41,11 +47,15 @@ func main() { panic(err) } else { fmt.Println("Skipping: ", article.OriginalUrl) + fmt.Println("Title ", article.Title) + fmt.Println("Error ", err) } } } - if scraper.KlixArticles == nil && scraper.SrpskainfoArticles == nil { + if scraper.KlixArticles == nil && + scraper.SrpskainfoArticles == nil && + scraper.BljesakArticles == nil { break } } @@ -77,9 +87,21 @@ func candidateChecker() { scraper.SrpskainfoApprovedSites <- url } } + + case url := <-scraper.BljesakCandidates: + if url == model.Terminator { + scraper.BljesakCandidates = nil + } else { + if !database.IsSaved(store, url) { + scraper.BljesakApprovedSites <- url + } + } + + } - if scraper.KlixCandidates == nil && + scraper.SrpskainfoCandidates == nil { + if scraper.KlixCandidates == nil && + scraper.SrpskainfoCandidates == nil && + scraper.BljesakCandidates == nil { break } diff --git a/db/migrations/20220215164610_drop_title_constraint.down.sql b/db/migrations/20220215164610_drop_title_constraint.down.sql new file mode 100644 index 0000000..0a7ac76 --- /dev/null +++ b/db/migrations/20220215164610_drop_title_constraint.down.sql @@ -0,0 +1,2 @@ +ALTER TABLE articles +ADD CONSTRAINT articles_title_key UNIQUE (title); diff --git a/db/migrations/20220215164610_drop_title_constraint.up.sql b/db/migrations/20220215164610_drop_title_constraint.up.sql new file mode 100644 index 0000000..0610b42 --- /dev/null +++ 
b/db/migrations/20220215164610_drop_title_constraint.up.sql @@ -0,0 +1,2 @@ +ALTER TABLE articles +DROP CONSTRAINT articles_title_key; diff --git a/internal/model/model.go b/internal/model/model.go index f51525b..1030bb8 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -28,6 +28,7 @@ type DisplayArticle struct { const ( KlixSource = 1 SrpskainfoSource = 2 + BljesakSource = 3 ) func SourceName(sourceId int) string { @@ -36,6 +37,8 @@ func SourceName(sourceId int) string { return "klix" case SrpskainfoSource: return "srpskainfo" + case BljesakSource: + return "bljesak" } return "starenovine" } diff --git a/internal/scraper/blijesak.go b/internal/scraper/blijesak.go new file mode 100644 index 0000000..50b2182 --- /dev/null +++ b/internal/scraper/blijesak.go @@ -0,0 +1,104 @@ +package scraper + +import ( + "fmt" + "github.com/gocolly/colly" + "github.com/gosimple/slug" + "gitlab.com/kbr4/svevijesti/internal/model" + "math/rand" + "regexp" + "strings" + "time" +) + +var BljesakArticles = make(chan model.ScrapedArticle) +var BljesakCandidates = make(chan string) +var BljesakApprovedSites = make(chan string, 2) + +func CrawlBljesak() { + + crHomePage := colly.NewCollector(colly.AllowedDomains("bljesak.info")) + crArticlePage := colly.NewCollector(colly.AllowedDomains("bljesak.info")) + + setupBljesakArticlePageCrawler(crArticlePage) + setupBljesakHomepageCrawler(crHomePage, crArticlePage) + + go visitBljesakApprovedPages(crArticlePage) +} + +func visitBljesakApprovedPages(crArticlePage *colly.Collector) { + fmt.Println("Consuming sites!") + for url := range BljesakApprovedSites { + fmt.Println("Visiting: ", url) + crArticlePage.Visit(url) + } +} + +func setupBljesakHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { + + articleUrlR := regexp.MustCompile("\\d\\d+$") + articleBlacklist := regexp.MustCompile("(info-vodic|foto-data)") + crHomePage.OnHTML("a", func(e *colly.HTMLElement) { + url := e.Attr("href") + completeUrl := 
url + if articleUrlR.MatchString(url) && !articleBlacklist.MatchString(url) { + BljesakCandidates <- completeUrl + } + }) + + crHomePage.OnScraped(func(_ *colly.Response) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + BljesakArticles <- terminating + BljesakApprovedSites <- model.Terminator + BljesakCandidates <- model.Terminator + }) + + crHomePage.OnError(func(_ *colly.Response, _ error) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + BljesakArticles <- terminating + BljesakApprovedSites <- model.Terminator + BljesakCandidates <- model.Terminator + }) + + go crHomePage.Visit("https://bljesak.info") +} + +func setupBljesakArticlePageCrawler(crArticlePage *colly.Collector) { + crArticlePage.OnHTML("html", func(e *colly.HTMLElement) { + + url := e.Request.URL.String() + + title := "" + e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool { + title = el.Text + return false + }) + + text := "" + + e.ForEach("div.col-article-content, div.intro, div.s-main-content", func(_ int, el *colly.HTMLElement) { + text += extractJustText(el.DOM) + }) + + article := model.ScrapedArticle{} + + trimmedText := strings.TrimSpace(text) + article.OriginalUrl = url + article.Title = title + article.Content = trimmedText + article.SourceId = model.BljesakSource + slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title) + article.Slug = slug.Make(slugBase) + + BljesakArticles <- article + }) + + crArticlePage.OnError(func(_ *colly.Response, err error) { + fmt.Println("Problem crawling!", err) + }) + +} diff --git a/internal/scraper/klix.go b/internal/scraper/klix.go index 636996f..4edc616 100644 --- a/internal/scraper/klix.go +++ b/internal/scraper/klix.go @@ -5,6 +5,7 @@ import ( "github.com/gocolly/colly" "github.com/gosimple/slug" "gitlab.com/kbr4/svevijesti/internal/model" + "math/rand" "regexp" "strings" 
"time" @@ -89,7 +90,8 @@ func setupKlArticlePageCrawler(crArticlePage *colly.Collector) { article.Title = title article.Content = trimmedText article.SourceId = model.KlixSource - article.Slug = slug.Make(title) + slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title) + article.Slug = slug.Make(slugBase) KlixArticles <- article }) diff --git a/internal/scraper/srpskainfo.go b/internal/scraper/srpskainfo.go index 3af1430..da6516d 100644 --- a/internal/scraper/srpskainfo.go +++ b/internal/scraper/srpskainfo.go @@ -5,6 +5,7 @@ import ( "github.com/gocolly/colly" "github.com/gosimple/slug" "gitlab.com/kbr4/svevijesti/internal/model" + "math/rand" "regexp" "strings" "time" @@ -89,7 +90,8 @@ func setupSiArticlePageCrawler(crArticlePage *colly.Collector) { article.Title = title article.Content = trimmedText article.SourceId = model.SrpskainfoSource - article.Slug = slug.Make(title) + slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title) + article.Slug = slug.Make(slugBase) SrpskainfoArticles <- article }) diff --git a/server b/server index 2b1774d..0c3ea33 100755 Binary files a/server and b/server differ diff --git a/spider b/spider index 088c49f..77647f9 100755 Binary files a/spider and b/spider differ