Prva verzija - klix scraper
This commit is contained in:
7
Makefile
Normal file
7
Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
# Connection string used by the migration targets. It is defined once so both
# targets always point at the same database; override it on the command line
# (`make migrateup DB_URL=...`) instead of editing the recipes.
# NOTE(review): this embeds real-looking credentials in version control —
# consider supplying DB_URL from the environment instead.
DB_URL = postgresql://svevijesti:salmonela%20pljusti%20221%20hamo@localhost:5432/svevijestiweb?sslmode=disable

# Apply the migrations found in db/migrations.
migrateup:
	migrate -path db/migrations -database "$(DB_URL)" -verbose up

# Revert the migrations found in db/migrations.
migratedown:
	migrate -path db/migrations -database "$(DB_URL)" -verbose down

.PHONY: migrateup migratedown
|
||||
46
cmd/spider/spider.go
Normal file
46
cmd/spider/spider.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/lib/pq"
|
||||
"gitlab.com/kbr4/svevijesti/internal/database"
|
||||
"gitlab.com/kbr4/svevijesti/internal/scraper"
|
||||
)
|
||||
|
||||
func main() {
|
||||
store, err := database.Connect()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
defer store.Close()
|
||||
go candidateChecker()
|
||||
go scraper.CrawlKlix()
|
||||
|
||||
for article := range scraper.KlixArticles {
|
||||
fmt.Println("Saving ", article.OriginalUrl)
|
||||
err = database.InsertArticle(store, article)
|
||||
if err, ok := err.(*pq.Error); ok {
|
||||
if err.Code.Name() != "unique_violation" {
|
||||
panic(err)
|
||||
} else {
|
||||
fmt.Println("Skipping: ", article.OriginalUrl)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func candidateChecker() {
|
||||
store, err := database.Connect()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
for url := range scraper.KlixCandidates {
|
||||
if !database.IsSaved(store, url) {
|
||||
scraper.KlixApprovedSites <- url
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
1
db/development.conf
Normal file
1
db/development.conf
Normal file
@@ -0,0 +1 @@
|
||||
postgresql://svevijesti:salmonela%20pljusti%20221%20hamo@localhost:5432/svevijestiweb?sslmode=disable
|
||||
1
db/migrations/20220206054902_create_articles.down.sql
Normal file
1
db/migrations/20220206054902_create_articles.down.sql
Normal file
@@ -0,0 +1 @@
|
||||
-- Reverts the create_articles migration. IF EXISTS keeps the rollback
-- idempotent, so re-running it after a partial rollback does not fail.
DROP TABLE IF EXISTS articles;
|
||||
16
db/migrations/20220206054902_create_articles.up.sql
Normal file
16
db/migrations/20220206054902_create_articles.up.sql
Normal file
@@ -0,0 +1,16 @@
|
||||
-- Creates the articles table that stores scraped articles.
-- title, slug and original_url are each unique; created_at defaults to the
-- insertion time.
BEGIN;

CREATE TABLE articles (
    id           bigint      GENERATED ALWAYS AS IDENTITY,
    title        text        NOT NULL UNIQUE,
    content      text        NOT NULL,
    slug         text        NOT NULL UNIQUE,
    created_at   timestamptz NOT NULL DEFAULT now(),
    original_url text        NOT NULL UNIQUE,
    source_id    int         NOT NULL,
    CONSTRAINT articles_pk PRIMARY KEY (id)
) WITH (
    OIDS = FALSE
);

COMMIT;
|
||||
21
go.mod
21
go.mod
@@ -1,3 +1,24 @@
|
||||
module gitlab.com/kbr4/svevijesti
|
||||
|
||||
go 1.17
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.8.0 // indirect
|
||||
github.com/andybalholm/cascadia v1.3.1 // indirect
|
||||
github.com/antchfx/htmlquery v1.2.4 // indirect
|
||||
github.com/antchfx/xmlquery v1.3.9 // indirect
|
||||
github.com/antchfx/xpath v1.2.0 // indirect
|
||||
github.com/gobwas/glob v0.2.3 // indirect
|
||||
github.com/gocolly/colly v1.2.0 // indirect
|
||||
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
|
||||
github.com/golang/protobuf v1.3.1 // indirect
|
||||
github.com/gosimple/slug v1.12.0 // indirect
|
||||
github.com/gosimple/unidecode v1.0.1 // indirect
|
||||
github.com/kennygrant/sanitize v1.2.4 // indirect
|
||||
github.com/lib/pq v1.10.4 // indirect
|
||||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
|
||||
github.com/temoto/robotstxt v1.1.2 // indirect
|
||||
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd // indirect
|
||||
golang.org/x/text v0.3.7 // indirect
|
||||
google.golang.org/appengine v1.6.7 // indirect
|
||||
)
|
||||
|
||||
61
go.sum
Normal file
61
go.sum
Normal file
@@ -0,0 +1,61 @@
|
||||
github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
|
||||
github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
|
||||
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
|
||||
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
|
||||
github.com/antchfx/htmlquery v1.2.4 h1:qLteofCMe/KGovBI6SQgmou2QNyedFUW+pE+BpeZ494=
|
||||
github.com/antchfx/htmlquery v1.2.4/go.mod h1:2xO6iu3EVWs7R2JYqBbp8YzG50gj/ofqs5/0VZoDZLc=
|
||||
github.com/antchfx/xmlquery v1.3.9 h1:Y+zyMdiUZ4fasTQTkDb3DflOXP7+obcYEh80SISBmnQ=
|
||||
github.com/antchfx/xmlquery v1.3.9/go.mod h1:wojC/BxjEkjJt6dPiAqUzoXO5nIMWtxHS8PD8TmN4ks=
|
||||
github.com/antchfx/xpath v1.2.0 h1:mbwv7co+x0RwgeGAOHdrKy89GvHaGvxxBtPK0uF9Zr8=
|
||||
github.com/antchfx/xpath v1.2.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
|
||||
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
|
||||
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
|
||||
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
|
||||
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
|
||||
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
|
||||
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
|
||||
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
|
||||
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
|
||||
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||
github.com/gosimple/slug v1.12.0 h1:xzuhj7G7cGtd34NXnW/yF0l+AGNfWqwgh/IXgFy7dnc=
|
||||
github.com/gosimple/slug v1.12.0/go.mod h1:UiRaFH+GEilHstLUmcBgWcI42viBN7mAb818JrYOeFQ=
|
||||
github.com/gosimple/unidecode v1.0.1 h1:hZzFTMMqSswvf0LBJZCZgThIZrpDHFXux9KeGmn6T/o=
|
||||
github.com/gosimple/unidecode v1.0.1/go.mod h1:CP0Cr1Y1kogOtx0bJblKzsVWrqYaqfNOnHzpgWw4Awc=
|
||||
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
|
||||
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
|
||||
github.com/lib/pq v1.10.4 h1:SO9z7FRPzA03QhHKJrH5BXA6HU1rS4V2nIVrrNC1iYk=
|
||||
github.com/lib/pq v1.10.4/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
|
||||
github.com/mattn/go-sqlite3 v1.14.11 h1:gt+cp9c0XGqe9S/wAHTL3n/7MqY+siPWgWJgqdsFrzQ=
|
||||
github.com/mattn/go-sqlite3 v1.14.11/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
|
||||
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
|
||||
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
||||
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
|
||||
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd h1:O7DYs+zxREGLKzKoMQrtrEacpb0ZVXA5rIwylE2Xchk=
|
||||
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c=
|
||||
google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
|
||||
59
internal/database/database.go
Normal file
59
internal/database/database.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package database
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
_ "github.com/lib/pq"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
)
|
||||
|
||||
// Connection parameters used by Connect to build the PostgreSQL connection
// string.
// NOTE(review): the credentials are hard-coded in source — consider loading
// them from the environment or a configuration file.
const (
	// host and port locate the PostgreSQL server.
	host = "localhost"
	port = 5432

	// user and password are the login credentials.
	user     = "svevijesti"
	password = "salmonela pljusti 221 hamo"

	// dbname is the database to connect to.
	dbname = "svevijestiweb"
)
|
||||
|
||||
// Store is an alias for sql.DB; it is the database handle type returned by
// Connect and accepted by InsertArticle and IsSaved.
type Store = sql.DB
|
||||
|
||||
func Connect() (*Store, error) {
|
||||
psqlInfo := fmt.Sprintf("host=%s port=%d user=%s "+
|
||||
"password='%s' dbname=%s sslmode=disable",
|
||||
host, port, user, password, dbname)
|
||||
db, err := sql.Open("postgres", psqlInfo)
|
||||
return db, err
|
||||
}
|
||||
|
||||
func InsertArticle(store *Store, article model.ScrapedArticle) (err error) {
|
||||
query := `
|
||||
INSERT INTO articles
|
||||
(title, content, slug, original_url, source_id)
|
||||
VALUES
|
||||
($1,$2,$3,$4,$5);`
|
||||
|
||||
_, err = store.Exec(query, article.Title, article.Content, article.Slug, article.OriginalUrl, article.SourceId)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func IsSaved(store *Store, url string) bool {
|
||||
|
||||
exists := false
|
||||
query, err := store.Prepare(`
|
||||
select exists(select 1 from articles where original_url = $1);
|
||||
`)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
row := query.QueryRow(url)
|
||||
err = row.Scan(&exists)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return exists
|
||||
}
|
||||
13
internal/model/model.go
Normal file
13
internal/model/model.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package model
|
||||
|
||||
// ScrapedArticle is a single article produced by a scraper and handed to the
// database layer for storage.
type ScrapedArticle struct {
	// Title is the article headline.
	Title string
	// Content is the extracted article text.
	Content string
	// Slug is a URL-friendly identifier for the article.
	Slug string
	// OriginalUrl is the address the article was scraped from.
	OriginalUrl string
	// SourceId identifies the site the article came from (e.g. KlixSource).
	SourceId int
}
|
||||
|
||||
// KlixSource is the source identifier assigned to articles scraped from Klix.
const KlixSource = 1
|
||||
96
internal/scraper/klix.go
Normal file
96
internal/scraper/klix.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly"
|
||||
"github.com/gosimple/slug"
|
||||
"gitlab.com/kbr4/svevijesti/internal/model"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var KlixArticles = make(chan model.ScrapedArticle)
|
||||
var KlixCandidates = make(chan string)
|
||||
var KlixApprovedSites = make(chan string, 2)
|
||||
|
||||
func CrawlKlix() {
|
||||
|
||||
crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||
crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba"))
|
||||
|
||||
setupArticlePageCrawler(crArticlePage)
|
||||
setupHomepageCrawler(crHomePage, crArticlePage)
|
||||
|
||||
go visitApprovedPages(crArticlePage)
|
||||
}
|
||||
|
||||
func visitApprovedPages(crArticlePage *colly.Collector) {
|
||||
fmt.Println("Consuming sites!")
|
||||
for url := range KlixApprovedSites {
|
||||
fmt.Println("Visiting: ", url)
|
||||
crArticlePage.Visit(url)
|
||||
}
|
||||
}
|
||||
|
||||
func setupHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) {
|
||||
|
||||
articleUrlR, _ := regexp.Compile("\\d\\d+$")
|
||||
crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) {
|
||||
url := e.Attr("href")
|
||||
completeUrl := "https://www.klix.ba" + url
|
||||
if articleUrlR.MatchString(url) {
|
||||
KlixCandidates <- completeUrl
|
||||
}
|
||||
})
|
||||
|
||||
crHomePage.OnScraped(func(_ *colly.Response) {
|
||||
time.Sleep(5 * time.Second)
|
||||
close(KlixArticles)
|
||||
close(KlixApprovedSites)
|
||||
close(KlixCandidates)
|
||||
})
|
||||
|
||||
crHomePage.OnError(func(_ *colly.Response, _ error) {
|
||||
close(KlixArticles)
|
||||
close(KlixApprovedSites)
|
||||
close(KlixCandidates)
|
||||
})
|
||||
|
||||
go crHomePage.Visit("https://www.klix.ba")
|
||||
}
|
||||
|
||||
func setupArticlePageCrawler(crArticlePage *colly.Collector) {
|
||||
crArticlePage.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
|
||||
url := e.Request.URL.String()
|
||||
|
||||
title := ""
|
||||
e.ForEachWithBreak("title", func(_ int, el *colly.HTMLElement) bool {
|
||||
title = el.Text
|
||||
return false
|
||||
})
|
||||
|
||||
text := ""
|
||||
|
||||
e.ForEach("div#text, p.lead", func(_ int, el *colly.HTMLElement) {
|
||||
text += extractJustText(el.DOM)
|
||||
})
|
||||
|
||||
article := model.ScrapedArticle{}
|
||||
|
||||
trimmedText := strings.TrimSpace(text)
|
||||
article.OriginalUrl = url
|
||||
article.Title = title
|
||||
article.Content = trimmedText
|
||||
article.SourceId = model.KlixSource
|
||||
article.Slug = slug.Make(title)
|
||||
|
||||
KlixArticles <- article
|
||||
})
|
||||
|
||||
crArticlePage.OnError(func(_ *colly.Response, _ error) {
|
||||
fmt.Println("Problem crawling!")
|
||||
})
|
||||
|
||||
}
|
||||
26
internal/scraper/scraper.go
Normal file
26
internal/scraper/scraper.go
Normal file
@@ -0,0 +1,26 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
func extractJustText(el *goquery.Selection) string {
|
||||
textPart := ""
|
||||
htmlPart, _ := el.Html()
|
||||
if len(el.Nodes) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
//fmt.Println("Checking: ", htmlPart, "Duzina: ", strconv.Itoa(len(el.Nodes)), " Type je ", el.Nodes[0].Type, " jednakost ", el.Text() == htmlPart)
|
||||
if len(el.Nodes) == 1 && el.Text() == htmlPart {
|
||||
return el.Text() + "\n"
|
||||
}
|
||||
|
||||
el.Children().Each(func(_ int, el2 *goquery.Selection) {
|
||||
if el2.Is("div, p, span, a") {
|
||||
textPart += extractJustText(el2)
|
||||
}
|
||||
})
|
||||
|
||||
return textPart
|
||||
}
|
||||
1
internal/server/server.go
Normal file
1
internal/server/server.go
Normal file
@@ -0,0 +1 @@
|
||||
package server
|
||||
Reference in New Issue
Block a user