Prva verzija - klix scraper
This commit is contained in:
59
internal/database/database.go
Normal file
59
internal/database/database.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package database
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
_ "github.com/lib/pq"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
)
|
||||
|
||||
// Connection parameters used by Connect to build the PostgreSQL DSN.
//
// NOTE(review): the credentials are compiled into the binary and committed to
// version control; consider loading them from the environment or a config
// file instead.
const (
	// host is the database server address.
	host = "localhost"
	// port is the TCP port of the database server.
	port = 5432
	// user is the database role used to log in.
	user = "svevijesti"
	// password is the secret for user.
	password = "salmonela pljusti 221 hamo"
	// dbname is the database that is opened.
	dbname = "svevijestiweb"
)
|
||||
|
||||
// Store is the database handle type accepted by the functions in this
// package. It is an alias of sql.DB, so a *sql.DB can be passed wherever a
// *Store is expected.
type Store = sql.DB
|
||||
|
||||
func Connect() (*Store, error) {
|
||||
psqlInfo := fmt.Sprintf("host=%s port=%d user=%s "+
|
||||
"password='%s' dbname=%s sslmode=disable",
|
||||
host, port, user, password, dbname)
|
||||
db, err := sql.Open("postgres", psqlInfo)
|
||||
return db, err
|
||||
}
|
||||
|
||||
func InsertArticle(store *Store, article model.ScrapedArticle) (err error) {
|
||||
query := `
|
||||
INSERT INTO articles
|
||||
(title, content, slug, original_url, source_id)
|
||||
VALUES
|
||||
($1,$2,$3,$4,$5);`
|
||||
|
||||
_, err = store.Exec(query, article.Title, article.Content, article.Slug, article.OriginalUrl, article.SourceId)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func IsSaved(store *Store, url string) bool {
|
||||
|
||||
exists := false
|
||||
query, err := store.Prepare(`
|
||||
select exists(select 1 from articles where original_url = $1);
|
||||
`)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
row := query.QueryRow(url)
|
||||
err = row.Scan(&exists)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return exists
|
||||
}
|
||||
13
internal/model/model.go
Normal file
13
internal/model/model.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package model
|
||||
|
||||
// ScrapedArticle is a single article extracted from a news site, in the shape
// in which it is passed from the scraper to the database layer.
type ScrapedArticle struct {
	// Title is the article headline.
	Title string
	// Content is the plain-text body of the article.
	Content string
	// Slug is a URL-friendly identifier for the article.
	Slug string
	// OriginalUrl is the address the article was scraped from.
	OriginalUrl string
	// SourceId identifies the site the article came from (e.g. KlixSource).
	SourceId int
}
|
||||
|
||||
// KlixSource is the source id assigned to articles scraped from klix.ba.
const KlixSource = 1
|
||||
96
internal/scraper/klix.go
Normal file
96
internal/scraper/klix.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var KlixArticles = make(chan model.ScrapedArticle)
|
||||
var KlixCandidates = make(chan string)
|
||||
var KlixApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlKlix() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||
|
||||
setupArticlePageCrawler(crArticlePage)
|
||||
setupHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range KlixApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
||||
crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
|
||||
url := e.Attr("href")
|
||||
completeUrl := "https://www.klix.ba" + url
|
||||
if articleUrlR.MatchString(url) {
|
||||
KlixCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
close(KlixArticles)
|
||||
close(KlixApprovedSites)
|
||||
close(KlixCandidates)
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
close(KlixArticles)
|
||||
close(KlixApprovedSites)
|
||||
close(KlixCandidates)
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://www.klix.ba")
|
||||
}
|
||||
|
||||
func setupArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("title", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("div#text, p.lead", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.KlixSource
|
||||
article.Slug = slug.Make(title)
|
||||
|
||||
KlixArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, _ error) {
|
||||
fmt.Println("Problem crawling!")
|
||||
})
|
||||
|
||||
}
|
||||
26
internal/scraper/scraper.go
Normal file
26
internal/scraper/scraper.go
Normal file
@@ -0,0 +1,26 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
func extractJustText(el *goquery.Selection) string {
|
||||
textPart := ""
|
||||
htmlPart, _ := el.Html()
|
||||
if len(el.Nodes) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
//fmt.Println("Checking: ", htmlPart, "Duzina: ", strconv.Itoa(len(el.Nodes)), " Type je ", el.Nodes[0].Type, " jednakost ", el.Text() == htmlPart)
|
||||
if len(el.Nodes) == 1 && el.Text() == htmlPart {
|
||||
return el.Text() + "\n"
|
||||
}
|
||||
|
||||
el.Children().Each(func(_ int, el2 *goquery.Selection) {
|
||||
if el2.Is("div, p, span, a") {
|
||||
textPart += extractJustText(el2)
|
||||
}
|
||||
})
|
||||
|
||||
return textPart
|
||||
}
|
||||
1
internal/server/server.go
Normal file
1
internal/server/server.go
Normal file
@@ -0,0 +1 @@
|
||||
package server
|
||||
Reference in New Issue
Block a user