Added bljesak.info
This commit is contained in:
@@ -18,6 +18,7 @@ func main() {
|
|||||||
go candidateChecker()
|
go candidateChecker()
|
||||||
go scraper.CrawlSrpskainfo()
|
go scraper.CrawlSrpskainfo()
|
||||||
go scraper.CrawlKlix()
|
go scraper.CrawlKlix()
|
||||||
|
go scraper.CrawlBljesak()
|
||||||
|
|
||||||
article := model.ScrapedArticle{}
|
article := model.ScrapedArticle{}
|
||||||
|
|
||||||
@@ -31,6 +32,11 @@ func main() {
|
|||||||
if article.Title == model.Terminator {
|
if article.Title == model.Terminator {
|
||||||
scraper.SrpskainfoArticles = nil
|
scraper.SrpskainfoArticles = nil
|
||||||
}
|
}
|
||||||
|
case article = <-scraper.BljesakArticles:
|
||||||
|
if article.Title == model.Terminator {
|
||||||
|
scraper.BljesakArticles = nil
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if article.Title != model.Terminator {
|
if article.Title != model.Terminator {
|
||||||
@@ -41,11 +47,15 @@ func main() {
|
|||||||
panic(err)
|
panic(err)
|
||||||
} else {
|
} else {
|
||||||
fmt.Println("Skipping: ", article.OriginalUrl)
|
fmt.Println("Skipping: ", article.OriginalUrl)
|
||||||
|
fmt.Println("Title ", article.Title)
|
||||||
|
fmt.Println("Error ", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if scraper.KlixArticles == nil && scraper.SrpskainfoArticles == nil {
|
if scraper.KlixArticles == nil &&
|
||||||
|
scraper.SrpskainfoArticles == nil &&
|
||||||
|
scraper.BljesakCandidates == nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -77,9 +87,21 @@ func candidateChecker() {
|
|||||||
scraper.SrpskainfoApprovedSites <- url
|
scraper.SrpskainfoApprovedSites <- url
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case url := <-scraper.BljesakCandidates:
|
||||||
|
if url == model.Terminator {
|
||||||
|
scraper.BljesakCandidates = nil
|
||||||
|
} else {
|
||||||
|
if !database.IsSaved(store, url) {
|
||||||
|
scraper.BljesakApprovedSites <- url
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if scraper.KlixCandidates == nil && scraper.SrpskainfoCandidates == nil {
|
if scraper.KlixCandidates == nil &&
|
||||||
|
scraper.SrpskainfoCandidates == nil &&
|
||||||
|
scraper.BljesakCandidates == nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,2 @@
|
|||||||
|
ALTER TABLE articles
|
||||||
|
ADD CONSTRAINT articles_title_key UNIQUE (title);
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
ALTER TABLE articles
|
||||||
|
DROP CONSTRAINT articles_title_key;
|
||||||
@@ -28,6 +28,7 @@ type DisplayArticle struct {
|
|||||||
const (
|
const (
|
||||||
KlixSource = 1
|
KlixSource = 1
|
||||||
SrpskainfoSource = 2
|
SrpskainfoSource = 2
|
||||||
|
BljesakSource = 3
|
||||||
)
|
)
|
||||||
|
|
||||||
func SourceName(sourceId int) string {
|
func SourceName(sourceId int) string {
|
||||||
@@ -36,6 +37,8 @@ func SourceName(sourceId int) string {
|
|||||||
return "klix"
|
return "klix"
|
||||||
case SrpskainfoSource:
|
case SrpskainfoSource:
|
||||||
return "srpskainfo"
|
return "srpskainfo"
|
||||||
|
case BljesakSource:
|
||||||
|
return "bljesak"
|
||||||
}
|
}
|
||||||
return "starenovine"
|
return "starenovine"
|
||||||
}
|
}
|
||||||
|
|||||||
104
internal/scraper/blijesak.go
Normal file
104
internal/scraper/blijesak.go
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
package scraper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"github.com/gocolly/colly"
|
||||||
|
"github.com/gosimple/slug"
|
||||||
|
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||||
|
"math/rand"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var BljesakArticles = make(chan model.ScrapedArticle)
|
||||||
|
var BljesakCandidates = make(chan string)
|
||||||
|
var BljesakApprovedSites = make(chan string, 2)
|
||||||
|
|
||||||
|
func CrawlBljesak() {
|
||||||
|
|
||||||
|
crHomePage := colly.NewCollector(colly.AllowedDomains("bljesak.info"))
|
||||||
|
crArticlePage := colly.NewCollector(colly.AllowedDomains("bljesak.info"))
|
||||||
|
|
||||||
|
setupBljesakArticlePageCrawler(crArticlePage)
|
||||||
|
setupBljesakHomepageCrawler(crHomePage, crArticlePage)
|
||||||
|
|
||||||
|
go visitBljesakApprovedPages(crArticlePage)
|
||||||
|
}
|
||||||
|
|
||||||
|
func visitBljesakApprovedPages(crArticlePage *colly.Collector) {
|
||||||
|
fmt.Println("Consuming sites!")
|
||||||
|
for url := range BljesakApprovedSites {
|
||||||
|
fmt.Println("Visiting: ", url)
|
||||||
|
crArticlePage.Visit(url)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func setupBljesakHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||||
|
|
||||||
|
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
||||||
|
articleBlacklist, _ := regexp.Compile("(info-vodic|foto-data)")
|
||||||
|
crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
|
||||||
|
url := e.Attr("href")
|
||||||
|
completeUrl := url
|
||||||
|
if articleUrlR.MatchString(url) && !articleBlacklist.MatchString(url) {
|
||||||
|
BljesakCandidates <- completeUrl
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||||
|
time.Sleep(5 * time.Second)
|
||||||
|
terminating := model.ScrapedArticle{}
|
||||||
|
terminating.Title = model.Terminator
|
||||||
|
BljesakArticles <- terminating
|
||||||
|
BljesakApprovedSites <- model.Terminator
|
||||||
|
BljesakCandidates <- model.Terminator
|
||||||
|
})
|
||||||
|
|
||||||
|
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||||
|
time.Sleep(5 * time.Second)
|
||||||
|
terminating := model.ScrapedArticle{}
|
||||||
|
terminating.Title = model.Terminator
|
||||||
|
BljesakArticles <- terminating
|
||||||
|
BljesakApprovedSites <- model.Terminator
|
||||||
|
BljesakCandidates <- model.Terminator
|
||||||
|
})
|
||||||
|
|
||||||
|
go crHomePage.Visit("https://bljesak.info")
|
||||||
|
}
|
||||||
|
|
||||||
|
func setupBljesakArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||||
|
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||||
|
|
||||||
|
url := e.Request.URL.String()
|
||||||
|
|
||||||
|
title := ""
|
||||||
|
e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool {
|
||||||
|
title = el.Text
|
||||||
|
return false
|
||||||
|
})
|
||||||
|
|
||||||
|
text := ""
|
||||||
|
|
||||||
|
e.ForEach("div.col-article-content, div.intro, div.s-main-content", func(_ int, el *colly.HTMLElement) {
|
||||||
|
text += extractJustText(el.DOM)
|
||||||
|
})
|
||||||
|
|
||||||
|
article := model.ScrapedArticle{}
|
||||||
|
|
||||||
|
trimmedText := strings.TrimSpace(text)
|
||||||
|
article.OriginalUrl = url
|
||||||
|
article.Title = title
|
||||||
|
article.Content = trimmedText
|
||||||
|
article.SourceId = model.BljesakSource
|
||||||
|
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||||
|
article.Slug = slug.Make(slugBase)
|
||||||
|
|
||||||
|
BljesakArticles <- article
|
||||||
|
})
|
||||||
|
|
||||||
|
crArticlePage.OnError(func(_ *colly.Response, err error) {
|
||||||
|
fmt.Println("Problem crawling!", err)
|
||||||
|
})
|
||||||
|
|
||||||
|
}
|
||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"github.com/gocolly/colly"
|
"github.com/gocolly/colly"
|
||||||
"github.com/gosimple/slug"
|
"github.com/gosimple/slug"
|
||||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||||
|
"math/rand"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -89,7 +90,8 @@ func setupKlArticlePageCrawler(crArticlePage *colly.Collector) {
|
|||||||
article.Title = title
|
article.Title = title
|
||||||
article.Content = trimmedText
|
article.Content = trimmedText
|
||||||
article.SourceId = model.KlixSource
|
article.SourceId = model.KlixSource
|
||||||
article.Slug = slug.Make(title)
|
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||||
|
article.Slug = slug.Make(slugBase)
|
||||||
|
|
||||||
KlixArticles <- article
|
KlixArticles <- article
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"github.com/gocolly/colly"
|
"github.com/gocolly/colly"
|
||||||
"github.com/gosimple/slug"
|
"github.com/gosimple/slug"
|
||||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||||
|
"math/rand"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -89,7 +90,8 @@ func setupSiArticlePageCrawler(crArticlePage *colly.Collector) {
|
|||||||
article.Title = title
|
article.Title = title
|
||||||
article.Content = trimmedText
|
article.Content = trimmedText
|
||||||
article.SourceId = model.SrpskainfoSource
|
article.SourceId = model.SrpskainfoSource
|
||||||
article.Slug = slug.Make(title)
|
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||||
|
article.Slug = slug.Make(slugBase)
|
||||||
|
|
||||||
SrpskainfoArticles <- article
|
SrpskainfoArticles <- article
|
||||||
})
|
})
|
||||||
|
|||||||
Reference in New Issue
Block a user