Prva verzija - klix scraper

This commit is contained in:
Senad Uka
2022-02-10 21:11:13 +01:00
parent 08f0de07c3
commit a040320827
13 changed files with 348 additions and 0 deletions

View File

@@ -0,0 +1,59 @@
package database
import (
"database/sql"
"fmt"
_ "github.com/lib/pq"
"gitlab.com/kbr4/svevijesti/internal/model"
)
// Connection settings for the PostgreSQL database. Connect interpolates these
// values into the connection string it passes to sql.Open.
// NOTE(review): the credentials are hard-coded in source; consider loading
// them from the environment or a configuration file instead.
const (
host = "localhost"
port = 5432
user = "svevijesti"
password = "salmonela pljusti 221 hamo"
dbname = "svevijestiweb"
)
// Store is an alias for sql.DB. It is the database handle returned by Connect
// and accepted by the query helpers in this package.
type Store = sql.DB
// Connect builds a PostgreSQL connection string from the package-level
// connection constants and opens a handle with the "postgres" driver.
//
// The handle and error are returned exactly as sql.Open produces them.
func Connect() (*Store, error) {
	const dsnFormat = "host=%s port=%d user=%s password='%s' dbname=%s sslmode=disable"

	dsn := fmt.Sprintf(dsnFormat, host, port, user, password, dbname)
	return sql.Open("postgres", dsn)
}
// InsertArticle stores one scraped article as a new row in the articles table.
//
// The title, content, slug, original URL and source id of article are bound
// as positional parameters. The error from Exec, if any, is returned as is.
func InsertArticle(store *Store, article model.ScrapedArticle) (err error) {
	const insertSQL = `
INSERT INTO articles
(title, content, slug, original_url, source_id)
VALUES
($1,$2,$3,$4,$5);`

	_, err = store.Exec(
		insertSQL,
		article.Title,
		article.Content,
		article.Slug,
		article.OriginalUrl,
		article.SourceId,
	)
	return err
}
// IsSaved reports whether the articles table already contains a row whose
// original_url equals url.
//
// It panics if the query or the scan of its result fails, because the
// signature offers no way to return an error.
func IsSaved(store *Store, url string) bool {
	const existsSQL = `
select exists(select 1 from articles where original_url = $1);
`
	exists := false
	// QueryRow is used instead of Prepare so that no prepared statement is
	// left open after the call returns.
	if err := store.QueryRow(existsSQL, url).Scan(&exists); err != nil {
		panic(err)
	}
	return exists
}

13
internal/model/model.go Normal file
View File

@@ -0,0 +1,13 @@
package model
// ScrapedArticle holds the data extracted from a single article page.
// The database package writes its fields to the title, content, slug,
// original_url and source_id columns of the articles table.
type ScrapedArticle struct {
// Title is the article headline.
Title string
// Content is the extracted article text.
Content string
// Slug is a URL-friendly identifier; the klix scraper derives it from Title.
Slug string
// OriginalUrl is the address the article was scraped from.
OriginalUrl string
// SourceId identifies the site the article came from, e.g. KlixSource.
SourceId int
}
// Source identifiers stored in ScrapedArticle.SourceId.
const (
// KlixSource marks articles scraped from klix.ba.
// NOTE(review): presumably matches the id of a row in a sources table — confirm.
KlixSource = 1
)

96
internal/scraper/klix.go Normal file
View File

@@ -0,0 +1,96 @@
package scraper
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gosimple/slug"
"gitlab.com/kbr4/svevijesti/internal/model"
"regexp"
"strings"
"time"
)
// KlixArticles receives every article produced by the article-page collector.
// It is unbuffered, so the scraper blocks until a consumer reads the value.
var KlixArticles = make(chan model.ScrapedArticle)
// KlixCandidates receives the absolute URL of every home-page link that looks
// like an article link. It is unbuffered.
var KlixCandidates = make(chan string)
// KlixApprovedSites is the queue of URLs that visitApprovedPages fetches with
// the article-page collector. It has a buffer of two entries.
// NOTE(review): nothing in this file sends on it; presumably a caller filters
// KlixCandidates and forwards the approved URLs here — confirm.
var KlixApprovedSites = make(chan string, 2)
// CrawlKlix wires up the klix.ba scraper and starts it.
//
// It creates one collector for the home page and one for article pages (both
// restricted to www.klix.ba), registers their callbacks, and launches a
// goroutine that visits every URL arriving on KlixApprovedSites. The function
// itself returns right away; results are delivered through the package-level
// channels.
func CrawlKlix() {
	const domain = "www.klix.ba"

	homepage := colly.NewCollector(colly.AllowedDomains(domain))
	articles := colly.NewCollector(colly.AllowedDomains(domain))

	setupArticlePageCrawler(articles)
	setupHomepageCrawler(homepage, articles)

	go visitApprovedPages(articles)
}
// visitApprovedPages fetches every URL received on KlixApprovedSites with the
// given article-page collector. It returns once the channel is closed.
//
// A failed visit is reported on standard output and does not stop the loop.
func visitApprovedPages(crArticlePage *colly.Collector) {
	fmt.Println("Consuming sites!")
	for url := range KlixApprovedSites {
		fmt.Println("Visiting: ", url)
		// Visit can fail without the collector's OnError callback running,
		// so surface the returned error instead of dropping it.
		if err := crArticlePage.Visit(url); err != nil {
			fmt.Println("Visit failed: ", url, err)
		}
	}
}
// setupHomepageCrawler registers the home-page callbacks on crHomePage and
// starts visiting https://www.klix.ba in a new goroutine.
//
// Every link inside ".container" whose href ends in at least two digits is
// treated as an article link and its absolute URL is sent on KlixCandidates.
// When the page has been scraped (after a five second grace period) or an
// error is reported, the three package-level channels are closed.
//
// crArticlePage is accepted for symmetry with the caller but is not used here.
func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
	// The pattern is a constant, so a compile failure is a programming error.
	articleUrlR := regexp.MustCompile(`\d\d+$`)

	crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
		url := e.Attr("href")
		if articleUrlR.MatchString(url) {
			KlixCandidates <- "https://www.klix.ba" + url
		}
	})

	// Closing an already closed channel panics, so make sure the channels are
	// closed only once even if more than one callback fires.
	// NOTE(review): a plain flag assumes the callbacks run sequentially on the
	// goroutine that calls Visit (collector not created with Async) — confirm.
	closed := false
	closeChannels := func() {
		if closed {
			return
		}
		closed = true
		close(KlixArticles)
		close(KlixApprovedSites)
		close(KlixCandidates)
	}

	crHomePage.OnScraped(func(_ *colly.Response) {
		// Give consumers of the channels some time before shutting down.
		time.Sleep(5 * time.Second)
		closeChannels()
	})
	crHomePage.OnError(func(_ *colly.Response, _ error) {
		closeChannels()
	})

	go crHomePage.Visit("https://www.klix.ba")
}
// setupArticlePageCrawler registers the article-page callbacks on
// crArticlePage.
//
// For each fetched page it takes the text of the first <title> element as the
// headline, gathers the text of every "div#text" and "p.lead" element via
// extractJustText, and sends the resulting ScrapedArticle on KlixArticles.
// Crawl errors are only reported on standard output.
func setupArticlePageCrawler(crArticlePage *colly.Collector) {
	crArticlePage.OnHTML("html", func(page *colly.HTMLElement) {
		// Stop after the first <title> element.
		var headline string
		page.ForEachWithBreak("title", func(_ int, node *colly.HTMLElement) bool {
			headline = node.Text
			return false
		})

		var body strings.Builder
		page.ForEach("div#text, p.lead", func(_ int, node *colly.HTMLElement) {
			body.WriteString(extractJustText(node.DOM))
		})

		KlixArticles <- model.ScrapedArticle{
			Title:       headline,
			Content:     strings.TrimSpace(body.String()),
			Slug:        slug.Make(headline),
			OriginalUrl: page.Request.URL.String(),
			SourceId:    model.KlixSource,
		}
	})

	crArticlePage.OnError(func(_ *colly.Response, _ error) {
		fmt.Println("Problem crawling!")
	})
}

View File

@@ -0,0 +1,26 @@
package scraper
import (
"github.com/PuerkitoBio/goquery"
)
// extractJustText returns the text found in the leaf elements of el, one
// line per leaf.
//
// A single element without child elements contributes its text followed by a
// newline. Otherwise the function recurses into the child div, p, span and a
// elements and concatenates their results; other child elements are skipped,
// and text placed directly inside an element that also has child elements is
// not included. An empty selection yields "".
func extractJustText(el *goquery.Selection) string {
	if len(el.Nodes) == 0 {
		return ""
	}

	children := el.Children()
	// Detect a leaf by the absence of child elements. Comparing Text() with
	// Html() is not reliable: Html() returns escaped markup, so text holding
	// characters such as & or quotes never compares equal and would be lost.
	if len(el.Nodes) == 1 && children.Length() == 0 {
		return el.Text() + "\n"
	}

	textPart := ""
	children.Each(func(_ int, el2 *goquery.Selection) {
		if el2.Is("div, p, span, a") {
			textPart += extractJustText(el2)
		}
	})
	return textPart
}

View File

@@ -0,0 +1 @@
package server