From 5f22302e441154e4ce563e43446e5760ad818b81 Mon Sep 17 00:00:00 2001 From: Senad Uka Date: Thu, 17 Feb 2022 18:58:16 +0100 Subject: [PATCH] Fix scrapers --- internal/scraper/blijesak.go | 2 +- internal/scraper/scraper.go | 2 +- internal/scraper/srpskainfo.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/scraper/blijesak.go b/internal/scraper/blijesak.go index 098ea3b..59860cc 100644 --- a/internal/scraper/blijesak.go +++ b/internal/scraper/blijesak.go @@ -80,7 +80,7 @@ func setupBljesakArticlePageCrawler(crArticlePage *colly.Collector) { text := "" - e.ForEach("article.b-article-detail, div#infiniteLoadBreakpoint, div.col-article-content, div.intro, div.s-main-content", func(_ int, el *colly.HTMLElement) { + e.ForEach("div.col-xs-12, article.b-article-detail, div#infiniteLoadBreakpoint, div.col-article-content, div.intro, div.s-main-content, h2, h3, h4, h5", func(_ int, el *colly.HTMLElement) { text += extractJustText(el.DOM) }) diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index e825480..4618330 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -17,7 +17,7 @@ func extractJustText(el *goquery.Selection) string { } el.Children().Each(func(_ int, el2 *goquery.Selection) { - if el2.Is("div, p, span, a") { + if el2.Is("div, p, span, a, h2, h3, h4, b, i") { textPart += extractJustText(el2) } }) diff --git a/internal/scraper/srpskainfo.go b/internal/scraper/srpskainfo.go index da6516d..15198ff 100644 --- a/internal/scraper/srpskainfo.go +++ b/internal/scraper/srpskainfo.go @@ -79,7 +79,7 @@ func setupSiArticlePageCrawler(crArticlePage *colly.Collector) { text := "" - e.ForEach("p.article__top-content, p.article__content, h4.article__content, h3.article__content, h2.article__content, div.article__content", func(_ int, el *colly.HTMLElement) { + e.ForEach("div.article__top-content, div.article__content, h4, h3, h2, div.article__content", func(_ int, el *colly.HTMLElement) { text += extractJustText(el.DOM) })