Added avaz / fixed install script

This commit is contained in:
Senad Uka
2022-02-22 21:06:27 +01:00
parent 2db4b0b2e2
commit c8d361f458
6 changed files with 124 additions and 0 deletions

View File

@@ -19,6 +19,7 @@ func main() {
go scraper.CrawlSrpskainfo()
go scraper.CrawlKlix()
go scraper.CrawlBljesak()
go scraper.CrawlAvaz()
article := model.ScrapedArticle{}
@@ -36,6 +37,10 @@ func main() {
if article.Title == model.Terminator {
scraper.BljesakArticles = nil
}
case article = <-scraper.AvazArticles:
if article.Title == model.Terminator {
scraper.AvazArticles = nil
}
}
@@ -55,6 +60,7 @@ func main() {
if scraper.KlixArticles == nil &&
scraper.SrpskainfoArticles == nil &&
scraper.AvazArticles == nil &&
scraper.BljesakCandidates == nil {
break
}
@@ -97,10 +103,20 @@ func candidateChecker() {
}
}
case url := <-scraper.AvazCandidates:
if url == model.Terminator {
scraper.AvazCandidates = nil
} else {
if !database.IsSaved(store, url) {
scraper.AvazApprovedSites <- url
}
}
}
if scraper.KlixCandidates == nil &&
scraper.SrpskainfoCandidates == nil &&
scraper.AvazCandidates == nil &&
scraper.BljesakCandidates == nil {
break
}

View File

@@ -29,6 +29,7 @@ const (
KlixSource = 1
SrpskainfoSource = 2
BljesakSource = 3
AvazSource = 4
)
func SourceName(sourceId int) string {
@@ -39,6 +40,8 @@ func SourceName(sourceId int) string {
return "srpskainfo"
case BljesakSource:
return "bljesak"
case AvazSource:
return "avaz"
}
return "starenovine"
}

104
internal/scraper/avaz.go Normal file
View File

@@ -0,0 +1,104 @@
package scraper
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gosimple/slug"
"gitlab.com/kbr4/svevijesti/internal/model"
"math/rand"
"regexp"
"strings"
"time"
)
// Channels that connect the avaz.ba crawler to the rest of the spider.
var (
	// AvazArticles carries scraped articles out of the article-page crawler.
	// An article whose Title equals model.Terminator marks the end of the
	// stream. Unbuffered.
	AvazArticles = make(chan model.ScrapedArticle)

	// AvazCandidates carries article links found on the homepage. The value
	// model.Terminator marks the end of the stream. Unbuffered.
	AvazCandidates = make(chan string)

	// AvazApprovedSites carries the URLs that visitAvazApprovedPages should
	// fetch. It has a buffer of two entries.
	AvazApprovedSites = make(chan string, 2)
)
// CrawlAvaz starts the avaz.ba scraper. It builds one collector for the
// homepage and one for article pages (both restricted to the avaz.ba domain),
// registers their callbacks, and launches the goroutine that visits every URL
// arriving on AvazApprovedSites. It returns without waiting for the crawl.
func CrawlAvaz() {
	newAvazCollector := func() *colly.Collector {
		return colly.NewCollector(colly.AllowedDomains("avaz.ba"))
	}
	homepage, articlePages := newAvazCollector(), newAvazCollector()

	// The article callbacks are registered before the homepage crawl is
	// set up, because setting up the homepage crawler also starts it.
	setupAvazArticlePageCrawler(articlePages)
	setupAvazHomepageCrawler(homepage, articlePages)

	go visitAvazApprovedPages(articlePages)
}
// visitAvazApprovedPages fetches every URL received on AvazApprovedSites with
// crArticlePage until it receives model.Terminator or the channel is closed.
//
// The homepage crawler pushes model.Terminator onto AvazApprovedSites when it
// is done. That value is a sentinel, not a URL, so it must end the loop
// instead of being handed to Visit; otherwise this goroutine would never exit.
func visitAvazApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for url := range AvazApprovedSites {
		if url == model.Terminator {
			return
		}
		fmt.Println("Visiting: ", url)
		// Report the failure here so a URL that could not be visited is
		// not dropped silently.
		if err := crArticlePage.Visit(url); err != nil {
			fmt.Println("Problem visiting!", url, err)
		}
	}
}
// setupAvazHomepageCrawler registers the homepage callbacks on crHomePage and
// starts crawling https://avaz.ba in a new goroutine.
//
// Every <a href> whose value looks like an article link (a path segment of
// two or more digits followed by a lowercase slug) and is not blacklisted is
// sent to AvazCandidates. When the homepage has been scraped, or the request
// failed, the terminator is sent on AvazArticles, AvazApprovedSites and
// AvazCandidates, in that order, after a five second pause.
//
// crArticlePage is not used here; it is kept so the signature stays the same
// for existing callers.
//
// NOTE(review): the href is forwarded exactly as found in the page. If the
// site emits relative links they would need resolving (for example with
// e.Request.AbsoluteURL) before the article collector can visit them; confirm
// against the live markup.
func setupAvazHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	// The patterns are constants, so a compile failure is a programming
	// error; MustCompile surfaces it instead of leaving a nil *Regexp.
	articleURLPattern := regexp.MustCompile(`/\d\d+/([a-z0-9-]+)`)
	blacklistPattern := regexp.MustCompile(`(info-vodic|foto-data)`)

	crHomePage.OnHTML("a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if articleURLPattern.MatchString(href) && !blacklistPattern.MatchString(href) {
			AvazCandidates <- href
		}
	})

	// sendTerminators tells every consumer of the Avaz channels that no more
	// data is coming. The success and the error path share it so they
	// cannot drift apart.
	sendTerminators := func() {
		// NOTE(review): the pause presumably gives in-flight article
		// visits time to finish before the stream is declared over.
		time.Sleep(5 * time.Second)
		AvazArticles <- model.ScrapedArticle{Title: model.Terminator}
		AvazApprovedSites <- model.Terminator
		AvazCandidates <- model.Terminator
	}
	crHomePage.OnScraped(func(_ *colly.Response) {
		sendTerminators()
	})
	crHomePage.OnError(func(_ *colly.Response, _ error) {
		sendTerminators()
	})

	go func() {
		if err := crHomePage.Visit("https://avaz.ba"); err != nil {
			fmt.Println("Problem crawling!", err)
		}
	}()
}
// setupAvazArticlePageCrawler registers the article-page callbacks on
// crArticlePage.
//
// For every fetched page it takes the first "h1.title, h3.title" element as
// the title, concatenates the text of all "p.podtitle, div.artikal-text"
// elements as the content, and sends the resulting model.ScrapedArticle on
// AvazArticles. Title and content are both stripped of surrounding
// whitespace. Request errors are only logged.
func setupAvazArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
		title := ""
		e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool {
			title = el.Text
			return false // the first heading is enough
		})
		// Heading text usually carries the whitespace of the surrounding
		// markup; trim it the same way the content is trimmed.
		title = strings.TrimSpace(title)

		var body strings.Builder
		e.ForEach("p.podtitle, div.artikal-text", func(_ int, el *colly.HTMLElement) {
			body.WriteString(extractJustText(el.DOM))
		})

		article := model.ScrapedArticle{}
		article.OriginalUrl = e.Request.URL.String()
		article.Title = title
		article.Content = strings.TrimSpace(body.String())
		article.SourceId = model.AvazSource

		// A random number below 1000 is mixed into the slug base so two
		// articles with the same title are unlikely to share a slug.
		slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title)
		article.Slug = slug.Make(slugBase)

		AvazArticles <- article
	})

	crArticlePage.OnError(func(_ *colly.Response, err error) {
		fmt.Println("Problem crawling!", err)
	})
}

View File

@@ -3,5 +3,6 @@
sudo systemctl stop starenovine
sudo cp ./server /opt/starenovine/server
sudo cp -R ./web /opt/starenovine/
sudo killall spider
sudo cp ./spider /opt/starenovine/spider
sudo systemctl start starenovine

BIN
server

Binary file not shown.

BIN
spider

Binary file not shown.