Fresh
This commit is contained in:
104
internal/scraper/avaz.go
Normal file
104
internal/scraper/avaz.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var AvazArticles = make(chan model.ScrapedArticle)
|
||||
var AvazCandidates = make(chan string)
|
||||
var AvazApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlAvaz() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("avaz.ba"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("avaz.ba"))
|
||||
|
||||
setupAvazArticlePageCrawler(crArticlePage)
|
||||
setupAvazHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitAvazApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitAvazApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range AvazApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupAvazHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
articleUrlR, _ := regexp.Compile("/\\d\\d+/([a-z0-9-]+)")
|
||||
articleBlacklist, _ := regexp.Compile("(english)")
|
||||
crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
|
||||
url := e.Attr("href")
|
||||
completeUrl := url
|
||||
if articleUrlR.MatchString(url) && !articleBlacklist.MatchString(url) {
|
||||
AvazCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
AvazArticles <- terminating
|
||||
AvazApprovedSites <- model.Terminator
|
||||
AvazCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
AvazArticles <- terminating
|
||||
AvazApprovedSites <- model.Terminator
|
||||
AvazCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://avaz.ba")
|
||||
}
|
||||
|
||||
func setupAvazArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("p.podtitle, div.artikal-text", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.AvazSource
|
||||
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||
article.Slug = slug.Make(slugBase)
|
||||
|
||||
AvazArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, err error) {
|
||||
fmt.Println("Problem crawling!", err)
|
||||
})
|
||||
|
||||
}
|
||||
104
internal/scraper/blijesak.go
Normal file
104
internal/scraper/blijesak.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var BljesakArticles = make(chan model.ScrapedArticle)
|
||||
var BljesakCandidates = make(chan string)
|
||||
var BljesakApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlBljesak() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("bljesak.info"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("bljesak.info"))
|
||||
|
||||
setupBljesakArticlePageCrawler(crArticlePage)
|
||||
setupBljesakHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitBljesakApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitBljesakApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range BljesakApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupBljesakHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
||||
articleBlacklist, _ := regexp.Compile("(info-vodic|foto-data)")
|
||||
crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
|
||||
url := e.Attr("href")
|
||||
completeUrl := url
|
||||
if articleUrlR.MatchString(url) && !articleBlacklist.MatchString(url) {
|
||||
BljesakCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
BljesakArticles <- terminating
|
||||
BljesakApprovedSites <- model.Terminator
|
||||
BljesakCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
BljesakArticles <- terminating
|
||||
BljesakApprovedSites <- model.Terminator
|
||||
BljesakCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://bljesak.info")
|
||||
}
|
||||
|
||||
func setupBljesakArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("div.intro, div#infiniteLoadBreakpoint", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.BljesakSource
|
||||
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||
article.Slug = slug.Make(slugBase)
|
||||
|
||||
BljesakArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, err error) {
|
||||
fmt.Println("Problem crawling!", err)
|
||||
})
|
||||
|
||||
}
|
||||
103
internal/scraper/klix.go
Normal file
103
internal/scraper/klix.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var KlixArticles = make(chan model.ScrapedArticle)
|
||||
var KlixCandidates = make(chan string)
|
||||
var KlixApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlKlix() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||
|
||||
setupKlArticlePageCrawler(crArticlePage)
|
||||
setupKlHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range KlixApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupKlHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
||||
crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
|
||||
url := e.Attr("href")
|
||||
completeUrl := "https://www.klix.ba" + url
|
||||
if articleUrlR.MatchString(url) {
|
||||
KlixCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
KlixArticles <- terminating
|
||||
KlixApprovedSites <- model.Terminator
|
||||
KlixCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
KlixArticles <- terminating
|
||||
KlixApprovedSites <- model.Terminator
|
||||
KlixCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://www.klix.ba")
|
||||
}
|
||||
|
||||
func setupKlArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("title", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("div#text, p.lead", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.KlixSource
|
||||
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||
article.Slug = slug.Make(slugBase)
|
||||
|
||||
KlixArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, _ error) {
|
||||
fmt.Println("Problem crawling!")
|
||||
})
|
||||
|
||||
}
|
||||
26
internal/scraper/scraper.go
Normal file
26
internal/scraper/scraper.go
Normal file
@@ -0,0 +1,26 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
func extractJustText(el *goquery.Selection) string {
|
||||
textPart := ""
|
||||
htmlPart, _ := el.Html()
|
||||
if len(el.Nodes) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
//fmt.Println("Checking: ", htmlPart, "Duzina: ", strconv.Itoa(len(el.Nodes)), " Type je ", el.Nodes[0].Type, " jednakost ", el.Text() == htmlPart)
|
||||
if el.Text() == htmlPart {
|
||||
return el.Text() + "\n"
|
||||
}
|
||||
|
||||
el.Children().Each(func(_ int, el2 *goquery.Selection) {
|
||||
if el2.Is("div, p, span, a") {
|
||||
textPart += extractJustText(el2)
|
||||
}
|
||||
})
|
||||
|
||||
return textPart
|
||||
}
|
||||
103
internal/scraper/srpskainfo.go
Normal file
103
internal/scraper/srpskainfo.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"math/rand"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var SrpskainfoArticles = make(chan model.ScrapedArticle)
|
||||
var SrpskainfoCandidates = make(chan string)
|
||||
var SrpskainfoApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlSrpskainfo() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com"))
|
||||
|
||||
setupSiArticlePageCrawler(crArticlePage)
|
||||
setupSiHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitSiApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitSiApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range SrpskainfoApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupSiHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
|
||||
articleUrlR, _ := regexp.Compile("([A-Za-z0-9]+-){3,}([A-Za-z0-9]+)/$")
|
||||
url := e.Attr("href")
|
||||
completeUrl := url
|
||||
if articleUrlR.MatchString(url) {
|
||||
SrpskainfoCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
SrpskainfoArticles <- terminating
|
||||
SrpskainfoApprovedSites <- model.Terminator
|
||||
SrpskainfoCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
time.Sleep(5 * time.Second)
|
||||
terminating := model.ScrapedArticle{}
|
||||
terminating.Title = model.Terminator
|
||||
SrpskainfoArticles <- terminating
|
||||
SrpskainfoApprovedSites <- model.Terminator
|
||||
SrpskainfoCandidates <- model.Terminator
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://srpskainfo.com")
|
||||
}
|
||||
|
||||
func setupSiArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("h1", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("div.article__top-content, div.article__content", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.SrpskainfoSource
|
||||
slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
|
||||
article.Slug = slug.Make(slugBase)
|
||||
|
||||
SrpskainfoArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, _ error) {
|
||||
fmt.Println("Problem crawling!")
|
||||
})
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user