commit 88741b230351875e1b1e8835d4fb0133ee926573 Author: Amir Sabani Date: Mon Dec 18 16:51:47 2023 +0100 Fresh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..64b372d --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,49 @@ +# This file is a template, and might need editing before it works on your project. +# This is a sample GitLab CI/CD configuration file that should run without any modifications. +# It demonstrates a basic 3 stage CI/CD pipeline. Instead of real tests or scripts, +# it uses echo commands to simulate the pipeline execution. +# +# A pipeline is composed of independent jobs that run scripts, grouped into stages. +# Stages run in sequential order, but jobs within stages run in parallel. +# +# For more information, see: https://docs.gitlab.com/ee/ci/yaml/index.html#stages +# +# You can copy and paste this template into a new `.gitlab-ci.yml` file. +# You should not add this template to an existing `.gitlab-ci.yml` file by using the `include:` keyword. +# +# To contribute improvements to CI/CD templates, please follow the Development guide at: +# https://docs.gitlab.com/ee/development/cicd/templates.html +# This specific template is located at: +# https://gitlab.com/gitlab-org/gitlab/-/blob/master/lib/gitlab/ci/templates/Getting-Started.gitlab-ci.yml + +stages: # List of stages for jobs, and their order of execution + - build + - test + - deploy + +build-job: # This job runs in the build stage, which runs first. + stage: build + script: + - echo "Compiling the code..." + - echo "Compile complete." + +unit-test-job: # This job runs in the test stage. + stage: test # It only starts when the job in the build stage completes successfully. + script: + - echo "Running unit tests... This will take about 60 seconds." + - sleep 60 + - echo "Code coverage is 90%" + +lint-test-job: # This job also runs in the test stage. + stage: test # It can run at the same time as unit-test-job (in parallel). + script: + - echo "Linting code... This will take about 10 seconds." + - sleep 10 + - echo "No lint issues found." + +deploy-job: # This job runs in the deploy stage. + stage: deploy # It only runs when *both* jobs in the test stage complete successfully. + environment: production + script: + - echo "Deploying application..." + - echo "Application successfully deployed." diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0987a6b --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +migrateup: + migrate -path db/migrations -database "postgresql://svevijesti:salmonela%20pljusti%20221%20hamo@localhost:5432/svevijestiweb?sslmode=disable" -verbose up + +migratedown: + migrate -path db/migrations -database "postgresql://svevijesti:salmonela%20pljusti%20221%20hamo@localhost:5432/svevijestiweb?sslmode=disable" -verbose down + +installserver: + bash ./scripts/install_server.sh + +.PHONY: migrateup migratedown diff --git a/cmd/spider/spider.go b/cmd/spider/spider.go new file mode 100644 index 0000000..4824487 --- /dev/null +++ b/cmd/spider/spider.go @@ -0,0 +1,126 @@ +package main + +import ( + "fmt" + "github.com/lib/pq" + "gitlab.com/kbr4/svevijesti/internal/database" + "gitlab.com/kbr4/svevijesti/internal/model" + "gitlab.com/kbr4/svevijesti/internal/scraper" +) + +func main() { + store, err := database.Connect() + if err != nil { + panic(err) + } + + defer store.Close() + go candidateChecker() + go scraper.CrawlSrpskainfo() + go scraper.CrawlKlix() + go scraper.CrawlBljesak() + go scraper.CrawlAvaz() + + article := model.ScrapedArticle{} + + for { + select { + case article = <-scraper.KlixArticles: + if article.Title == model.Terminator { + scraper.KlixArticles = nil + } + case article = <-scraper.SrpskainfoArticles: + if article.Title == model.Terminator { + scraper.SrpskainfoArticles = nil + } + case article = <-scraper.BljesakArticles: + if article.Title == model.Terminator { + scraper.BljesakArticles = nil + } + case article = <-scraper.AvazArticles: + if article.Title == model.Terminator { + scraper.AvazArticles = nil + } + + } + + if article.Title != model.Terminator { + fmt.Println("Saving ", article.OriginalUrl) + err = database.InsertArticle(store, article) + if err, ok := err.(*pq.Error); ok { + if err.Code.Name() != "unique_violation" { + panic(err) + } else { + fmt.Println("Skipping: ", article.OriginalUrl) + fmt.Println("Title ", article.Title) + fmt.Println("Error ", err) + } + } + } + + if scraper.KlixArticles == nil && + scraper.SrpskainfoArticles == nil && + scraper.AvazArticles == nil && + scraper.BljesakCandidates == nil { + break + } + } +} + +func candidateChecker() { + store, err := database.Connect() + if err != nil { + panic(err) + } + defer store.Close() + + for { + select { + case url := <-scraper.KlixCandidates: + if url == model.Terminator { + scraper.KlixCandidates = nil + } else { + if !database.IsSaved(store, url) { + scraper.KlixApprovedSites <- url + } + } + + case url := <-scraper.SrpskainfoCandidates: + if url == model.Terminator { + scraper.SrpskainfoCandidates = nil + } else { + if !database.IsSaved(store, url) { + scraper.SrpskainfoApprovedSites <- url + } + } + + case url := <-scraper.BljesakCandidates: + if url == model.Terminator { + scraper.BljesakCandidates = nil + } else { + if !database.IsSaved(store, url) { + scraper.BljesakApprovedSites <- url + } + } + + case url := <-scraper.AvazCandidates: + if url == model.Terminator { + scraper.AvazCandidates = nil + } else { + if !database.IsSaved(store, url) { + scraper.AvazApprovedSites <- url + } + } + + } + + if scraper.KlixCandidates == nil && + scraper.SrpskainfoCandidates == nil && + scraper.AvazCandidates == nil && + scraper.BljesakCandidates == nil { + break + } + + } + +} diff --git a/cmd/web/web.go b/cmd/web/web.go new file mode 100644 index 0000000..b2b8c59 --- /dev/null +++ b/cmd/web/web.go @@ -0,0 +1,23 @@ +package main + +import ( + "gitlab.com/kbr4/svevijesti/internal/server" + "log" + "net/http" + "time" +) + +func main() { + r := server.CreateRoutes() + http.Handle("/", r) + + srv := &http.Server{ + Handler: r, + Addr: "127.0.0.1:8080", + // Good practice: enforce timeouts for servers you create! + WriteTimeout: 15 * time.Second, + ReadTimeout: 15 * time.Second, + } + + log.Fatal(srv.ListenAndServe()) +} diff --git a/db/development.conf b/db/development.conf new file mode 100644 index 0000000..a360b91 --- /dev/null +++ b/db/development.conf @@ -0,0 +1 @@ +postgresql://svevijesti:salmonela%20pljusti%20221%20hamo@localhost:5432/svevijestiweb?ssl_mode=disabled diff --git a/db/migrations/20220206054902_create_articles.down.sql b/db/migrations/20220206054902_create_articles.down.sql new file mode 100644 index 0000000..5f6bacd --- /dev/null +++ b/db/migrations/20220206054902_create_articles.down.sql @@ -0,0 +1 @@ +DROP TABLE articles; diff --git a/db/migrations/20220206054902_create_articles.up.sql b/db/migrations/20220206054902_create_articles.up.sql new file mode 100644 index 0000000..525d7e7 --- /dev/null +++ b/db/migrations/20220206054902_create_articles.up.sql @@ -0,0 +1,16 @@ +BEGIN; + +CREATE TABLE "articles" ( + "id" bigint GENERATED ALWAYS AS IDENTITY, + "title" text NOT NULL UNIQUE, + "content" text NOT NULL, + "slug" text NOT NULL UNIQUE, + "created_at" timestamptz DEFAULT NOW() NOT NULL, + "original_url" text NOT NULL UNIQUE, + "source_id" int NOT NULL, + CONSTRAINT "articles_pk" PRIMARY KEY ("id") +) WITH ( + OIDS=FALSE +); + +COMMIT; diff --git a/db/migrations/20220215164610_drop_title_constraint.down.sql b/db/migrations/20220215164610_drop_title_constraint.down.sql new file mode 100644 index 0000000..0a7ac76 --- /dev/null +++ b/db/migrations/20220215164610_drop_title_constraint.down.sql @@ -0,0 +1,2 @@ +ALTER TABLE articles +ADD CONSTRAINT articles_title_key UNIQUE (title); diff --git a/db/migrations/20220215164610_drop_title_constraint.up.sql b/db/migrations/20220215164610_drop_title_constraint.up.sql new file mode 100644 index 0000000..0610b42 --- /dev/null +++ b/db/migrations/20220215164610_drop_title_constraint.up.sql @@ -0,0 +1,2 @@ +ALTER TABLE articles +DROP CONSTRAINT articles_title_key; diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..a5b4e5d --- /dev/null +++ b/go.mod @@ -0,0 +1,25 @@ +module gitlab.com/kbr4/svevijesti + +go 1.17 + +require ( + github.com/PuerkitoBio/goquery v1.8.0 // indirect + github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/antchfx/htmlquery v1.2.4 // indirect + github.com/antchfx/xmlquery v1.3.9 // indirect + github.com/antchfx/xpath v1.2.0 // indirect + github.com/gobwas/glob v0.2.3 // indirect + github.com/gocolly/colly v1.2.0 // indirect + github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect + github.com/golang/protobuf v1.3.1 // indirect + github.com/gorilla/mux v1.8.0 // indirect + github.com/gosimple/slug v1.12.0 // indirect + github.com/gosimple/unidecode v1.0.1 // indirect + github.com/kennygrant/sanitize v1.2.4 // indirect + github.com/lib/pq v1.10.4 // indirect + github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd // indirect + golang.org/x/text v0.3.7 // indirect + google.golang.org/appengine v1.6.7 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..b68be6c --- /dev/null +++ b/go.sum @@ -0,0 +1,63 @@ +github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= +github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/antchfx/htmlquery v1.2.4 h1:qLteofCMe/KGovBI6SQgmou2QNyedFUW+pE+BpeZ494= +github.com/antchfx/htmlquery v1.2.4/go.mod h1:2xO6iu3EVWs7R2JYqBbp8YzG50gj/ofqs5/0VZoDZLc= +github.com/antchfx/xmlquery v1.3.9 h1:Y+zyMdiUZ4fasTQTkDb3DflOXP7+obcYEh80SISBmnQ= +github.com/antchfx/xmlquery v1.3.9/go.mod h1:wojC/BxjEkjJt6dPiAqUzoXO5nIMWtxHS8PD8TmN4ks= +github.com/antchfx/xpath v1.2.0 h1:mbwv7co+x0RwgeGAOHdrKy89GvHaGvxxBtPK0uF9Zr8= +github.com/antchfx/xpath v1.2.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= +github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= +github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= +github.com/gosimple/slug v1.12.0 h1:xzuhj7G7cGtd34NXnW/yF0l+AGNfWqwgh/IXgFy7dnc= +github.com/gosimple/slug v1.12.0/go.mod h1:UiRaFH+GEilHstLUmcBgWcI42viBN7mAb818JrYOeFQ= +github.com/gosimple/unidecode v1.0.1 h1:hZzFTMMqSswvf0LBJZCZgThIZrpDHFXux9KeGmn6T/o= +github.com/gosimple/unidecode v1.0.1/go.mod h1:CP0Cr1Y1kogOtx0bJblKzsVWrqYaqfNOnHzpgWw4Awc= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/lib/pq v1.10.4 h1:SO9z7FRPzA03QhHKJrH5BXA6HU1rS4V2nIVrrNC1iYk= +github.com/lib/pq v1.10.4/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/mattn/go-sqlite3 v1.14.11 h1:gt+cp9c0XGqe9S/wAHTL3n/7MqY+siPWgWJgqdsFrzQ= +github.com/mattn/go-sqlite3 v1.14.11/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd h1:O7DYs+zxREGLKzKoMQrtrEacpb0ZVXA5rIwylE2Xchk= +golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= diff --git a/init/starenovine.service b/init/starenovine.service new file mode 100644 index 0000000..c77f34a --- /dev/null +++ b/init/starenovine.service @@ -0,0 +1,17 @@ +[Unit] + +Description=Stare Novine Web +After=network-online.target +Wants=network-online.target systemd-networkd-wait-online.service + +StartLimitIntervalSec=500 +StartLimitBurst=5 + +[Service] +Restart=on-failure +RestartSec=5s +WorkingDirectory=/opt/starenovine/ +ExecStart=/opt/starenovine/server + +[Install] +WantedBy=multi-user.target diff --git a/internal/database/articles.go b/internal/database/articles.go new file mode 100644 index 0000000..c85aeda --- /dev/null +++ b/internal/database/articles.go @@ -0,0 +1,188 @@ +package database + +import ( + "fmt" + _ "github.com/lib/pq" + "gitlab.com/kbr4/svevijesti/internal/model" + "html/template" + "math" + "strings" + "time" +) + +func InsertArticle(store *Store, article model.ScrapedArticle) (err error) { + query := ` + INSERT INTO articles + (title, content, slug, original_url, source_id) + VALUES + ($1,$2,$3,$4,$5);` + + _, err = store.Exec(query, article.Title, article.Content, article.Slug, article.OriginalUrl, article.SourceId) + if err != nil { + return err + } + + return nil +} + +func IsSaved(store *Store, url string) bool { + + exists := false + query, err := store.Prepare(` + select exists(select 1 from articles where original_url = $1); + `) + if err != nil { + panic(err) + } + defer query.Close() + + row := query.QueryRow(url) + err = row.Scan(&exists) + if err != nil { + panic(err) + } + return exists +} + +func ArticlesForDay(store *Store, day time.Time) (articles []model.DisplayArticle, err error) { + + result := []model.DisplayArticle{} + query, err := store.Prepare(` + select id,title, content, slug, original_url, source_id, created_at from articles where created_at > $1 and created_at < $2 and LENGTH(content) > 10 order by id desc; + `) + if err != nil { + return result, err + } + defer query.Close() + + tomorrow := day.AddDate(0, 0, 1) + todayDate := day.Format("2006-01-02") + tomorrowDate := tomorrow.Format("2006-01-02") + + rows, err := query.Query(todayDate, tomorrowDate) + if err != nil { + return result, err + } + defer rows.Close() + + for rows.Next() { + r := model.DisplayArticle{} + err = rows.Scan(&r.ID, &r.Title, &r.Content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt) + if err != nil { + return result, err + } + + ago := time.Now().Sub(r.CreatedAt) + hours := ago.Hours() + + if hours < 1 { + r.FormatedCreatedAt = fmt.Sprintf("Prije %d minuta.", int(math.Floor(ago.Minutes()))) + + } else if hours > 24 { + r.FormatedCreatedAt = r.CreatedAt.Format("02.01.2006. 15:04:05") + } else { + r.FormatedCreatedAt = fmt.Sprintf("Prije %d sati.", int(math.Floor(hours))) + } + r.SourceName = model.SourceName(r.SourceId) + + result = append(result, r) + } + + return result, nil +} + +func ArticleByID(store *Store, ID int, slug string) (article model.DisplayArticle, err error) { + + result := model.DisplayArticle{} + query, err := store.Prepare(` + select id,title, content, slug, original_url, source_id, created_at from articles where id = $1 and slug = $2; + `) + if err != nil { + return result, err + } + defer query.Close() + + row := query.QueryRow(ID, slug) + if err != nil { + return result, err + } + + r := model.DisplayArticle{} + content := "" + err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt) + if err != nil { + return result, err + } + + ago := time.Now().Sub(r.CreatedAt) + hours := ago.Hours() + + r.Content = template.HTML(strings.Replace(content, "\n", "
\n", -1)) + + if hours < 1 { + r.FormatedCreatedAt = fmt.Sprintf("Prije %d minuta.", int(math.Floor(ago.Minutes()))) + + } else if hours > 24 { + r.FormatedCreatedAt = r.CreatedAt.Format("02.01.2006. 15:04:05") + } else { + r.FormatedCreatedAt = fmt.Sprintf("Prije %d sati.", int(math.Floor(hours))) + } + r.SourceName = model.SourceName(r.SourceId) + + result = r + + return result, nil +} + +func PreviousAndNextArticleUrlByID(store *Store, ID int) (nextUrl string, previousUrl string, err error) { + + nextResult, previousResult := "#", "#" + query, err := store.Prepare(` + select id,title, content, slug, original_url, source_id, created_at from articles where id < $1 and id > $2 order by id desc limit 1; + `) + if err != nil { + fmt.Println("Err 1:", err) + return nextResult, previousResult, err + } + defer query.Close() + + row := query.QueryRow(ID, 0) + if err != nil { + fmt.Println("Err 2:", err) + return nextResult, previousResult, err + } + + r := model.DisplayArticle{} + content := "" + err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt) + if err != nil { + return nextResult, previousResult, err + } + + previousResult = fmt.Sprintf("/%d/%s", r.ID, r.Slug) + + query2, err := store.Prepare(` + select id,title, content, slug, original_url, source_id, created_at from articles where id < $1 and id > $2 order by id asc limit 1; + `) + if err != nil { + fmt.Println("Err 1:", err) + return nextResult, previousResult, err + } + defer query2.Close() + + row = query2.QueryRow(ID+1000, ID) + if err != nil { + fmt.Println("Err 3:", err) + return nextResult, previousResult, err + } + + content = "" + err = row.Scan(&r.ID, &r.Title, &content, &r.Slug, &r.OriginalUrl, &r.SourceId, &r.CreatedAt) + if err != nil { + fmt.Println("Err 4:", err) + return nextResult, previousResult, err + } + nextResult = fmt.Sprintf("/%d/%s", r.ID, r.Slug) + + return nextResult, previousResult, nil +} diff --git a/internal/database/database.go b/internal/database/database.go new file mode 100644 index 0000000..f280b0a --- /dev/null +++ b/internal/database/database.go @@ -0,0 +1,25 @@ +package database + +import ( + "database/sql" + "fmt" + _ "github.com/lib/pq" +) + +const ( + host = "localhost" + port = 5432 + user = "svevijesti" + password = "salmonela pljusti 221 hamo" + dbname = "svevijestiweb" +) + +type Store = sql.DB + +func Connect() (*Store, error) { + psqlInfo := fmt.Sprintf("host=%s port=%d user=%s "+ + "password='%s' dbname=%s sslmode=disable", + host, port, user, password, dbname) + db, err := sql.Open("postgres", psqlInfo) + return db, err +} diff --git a/internal/model/model.go b/internal/model/model.go new file mode 100644 index 0000000..c085bdf --- /dev/null +++ b/internal/model/model.go @@ -0,0 +1,51 @@ +package model + +import ( + "html/template" + "time" +) + +type ScrapedArticle struct { + Title string + Content string + Slug string + OriginalUrl string + SourceId int +} + +type DisplayArticle struct { + ID int + Title string + Content template.HTML + Slug string + OriginalUrl string + SourceId int + CreatedAt time.Time + FormatedCreatedAt string + SourceName string +} + +const ( + KlixSource = 1 + SrpskainfoSource = 2 + BljesakSource = 3 + AvazSource = 4 +) + +func SourceName(sourceId int) string { + switch sourceId { + case KlixSource: + return "klix" + case SrpskainfoSource: + return "srpskainfo" + case BljesakSource: + return "bljesak" + case AvazSource: + return "avaz" + } + return "starenovine" +} + +const ( + Terminator = "TERMINATED" +) diff --git a/internal/scraper/avaz.go b/internal/scraper/avaz.go new file mode 100644 index 0000000..906fb80 --- /dev/null +++ b/internal/scraper/avaz.go @@ -0,0 +1,104 @@ +package scraper + +import ( + "fmt" + "github.com/gocolly/colly" + "github.com/gosimple/slug" + "gitlab.com/kbr4/svevijesti/internal/model" + "math/rand" + "regexp" + "strings" + "time" +) + +var AvazArticles = make(chan model.ScrapedArticle) +var AvazCandidates = make(chan string) +var AvazApprovedSites = make(chan string, 2) + +func CrawlAvaz() { + + crHomePage := colly.NewCollector(colly.AllowedDomains("avaz.ba")) + crArticlePage := colly.NewCollector(colly.AllowedDomains("avaz.ba")) + + setupAvazArticlePageCrawler(crArticlePage) + setupAvazHomepageCrawler(crHomePage, crArticlePage) + + go visitAvazApprovedPages(crArticlePage) +} + +func visitAvazApprovedPages(crArticlePage *colly.Collector) { + fmt.Println("Consuming sites!") + for url := range AvazApprovedSites { + fmt.Println("Visiting: ", url) + crArticlePage.Visit(url) + } +} + +func setupAvazHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { + + articleUrlR, _ := regexp.Compile("/\\d\\d+/([a-z0-9-]+)") + articleBlacklist, _ := regexp.Compile("(english)") + crHomePage.OnHTML("a", func(e *colly.HTMLElement) { + url := e.Attr("href") + completeUrl := url + if articleUrlR.MatchString(url) && !articleBlacklist.MatchString(url) { + AvazCandidates <- completeUrl + } + }) + + crHomePage.OnScraped(func(_ *colly.Response) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + AvazArticles <- terminating + AvazApprovedSites <- model.Terminator + AvazCandidates <- model.Terminator + }) + + crHomePage.OnError(func(_ *colly.Response, _ error) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + AvazArticles <- terminating + AvazApprovedSites <- model.Terminator + AvazCandidates <- model.Terminator + }) + + go crHomePage.Visit("https://avaz.ba") +} + +func setupAvazArticlePageCrawler(crArticlePage *colly.Collector) { + crArticlePage.OnHTML("html", func(e *colly.HTMLElement) { + + url := e.Request.URL.String() + + title := "" + e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool { + title = el.Text + return false + }) + + text := "" + + e.ForEach("p.podtitle, div.artikal-text", func(_ int, el *colly.HTMLElement) { + text += extractJustText(el.DOM) + }) + + article := model.ScrapedArticle{} + + trimmedText := strings.TrimSpace(text) + article.OriginalUrl = url + article.Title = title + article.Content = trimmedText + article.SourceId = model.AvazSource + slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title) + article.Slug = slug.Make(slugBase) + + AvazArticles <- article + }) + + crArticlePage.OnError(func(_ *colly.Response, err error) { + fmt.Println("Problem crawling!", err) + }) + +} diff --git a/internal/scraper/blijesak.go b/internal/scraper/blijesak.go new file mode 100644 index 0000000..263d2c5 --- /dev/null +++ b/internal/scraper/blijesak.go @@ -0,0 +1,104 @@ +package scraper + +import ( + "fmt" + "github.com/gocolly/colly" + "github.com/gosimple/slug" + "gitlab.com/kbr4/svevijesti/internal/model" + "math/rand" + "regexp" + "strings" + "time" +) + +var BljesakArticles = make(chan model.ScrapedArticle) +var BljesakCandidates = make(chan string) +var BljesakApprovedSites = make(chan string, 2) + +func CrawlBljesak() { + + crHomePage := colly.NewCollector(colly.AllowedDomains("bljesak.info")) + crArticlePage := colly.NewCollector(colly.AllowedDomains("bljesak.info")) + + setupBljesakArticlePageCrawler(crArticlePage) + setupBljesakHomepageCrawler(crHomePage, crArticlePage) + + go visitBljesakApprovedPages(crArticlePage) +} + +func visitBljesakApprovedPages(crArticlePage *colly.Collector) { + fmt.Println("Consuming sites!") + for url := range BljesakApprovedSites { + fmt.Println("Visiting: ", url) + crArticlePage.Visit(url) + } +} + +func setupBljesakHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { + + articleUrlR, _ := regexp.Compile("\\d\\d+$") + articleBlacklist, _ := regexp.Compile("(info-vodic|foto-data)") + crHomePage.OnHTML("a", func(e *colly.HTMLElement) { + url := e.Attr("href") + completeUrl := url + if articleUrlR.MatchString(url) && !articleBlacklist.MatchString(url) { + BljesakCandidates <- completeUrl + } + }) + + crHomePage.OnScraped(func(_ *colly.Response) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + BljesakArticles <- terminating + BljesakApprovedSites <- model.Terminator + BljesakCandidates <- model.Terminator + }) + + crHomePage.OnError(func(_ *colly.Response, _ error) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + BljesakArticles <- terminating + BljesakApprovedSites <- model.Terminator + BljesakCandidates <- model.Terminator + }) + + go crHomePage.Visit("https://bljesak.info") +} + +func setupBljesakArticlePageCrawler(crArticlePage *colly.Collector) { + crArticlePage.OnHTML("html", func(e *colly.HTMLElement) { + + url := e.Request.URL.String() + + title := "" + e.ForEachWithBreak("h1.title, h3.title", func(_ int, el *colly.HTMLElement) bool { + title = el.Text + return false + }) + + text := "" + + e.ForEach("div.intro, div#infiniteLoadBreakpoint", func(_ int, el *colly.HTMLElement) { + text += extractJustText(el.DOM) + }) + + article := model.ScrapedArticle{} + + trimmedText := strings.TrimSpace(text) + article.OriginalUrl = url + article.Title = title + article.Content = trimmedText + article.SourceId = model.BljesakSource + slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title) + article.Slug = slug.Make(slugBase) + + BljesakArticles <- article + }) + + crArticlePage.OnError(func(_ *colly.Response, err error) { + fmt.Println("Problem crawling!", err) + }) + +} diff --git a/internal/scraper/klix.go b/internal/scraper/klix.go new file mode 100644 index 0000000..4edc616 --- /dev/null +++ b/internal/scraper/klix.go @@ -0,0 +1,103 @@ +package scraper + +import ( + "fmt" + "github.com/gocolly/colly" + "github.com/gosimple/slug" + "gitlab.com/kbr4/svevijesti/internal/model" + "math/rand" + "regexp" + "strings" + "time" +) + +var KlixArticles = make(chan model.ScrapedArticle) +var KlixCandidates = make(chan string) +var KlixApprovedSites = make(chan string, 2) + +func CrawlKlix() { + + crHomePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba")) + crArticlePage := colly.NewCollector(colly.AllowedDomains("www.klix.ba")) + + setupKlArticlePageCrawler(crArticlePage) + setupKlHomepageCrawler(crHomePage, crArticlePage) + + go visitApprovedPages(crArticlePage) +} + +func visitApprovedPages(crArticlePage *colly.Collector) { + fmt.Println("Consuming sites!") + for url := range KlixApprovedSites { + fmt.Println("Visiting: ", url) + crArticlePage.Visit(url) + } +} + +func setupKlHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { + + articleUrlR, _ := regexp.Compile("\\d\\d+$") + crHomePage.OnHTML(".container a", func(e *colly.HTMLElement) { + url := e.Attr("href") + completeUrl := "https://www.klix.ba" + url + if articleUrlR.MatchString(url) { + KlixCandidates <- completeUrl + } + }) + + crHomePage.OnScraped(func(_ *colly.Response) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + KlixArticles <- terminating + KlixApprovedSites <- model.Terminator + KlixCandidates <- model.Terminator + }) + + crHomePage.OnError(func(_ *colly.Response, _ error) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + KlixArticles <- terminating + KlixApprovedSites <- model.Terminator + KlixCandidates <- model.Terminator + }) + + go crHomePage.Visit("https://www.klix.ba") +} + +func setupKlArticlePageCrawler(crArticlePage *colly.Collector) { + crArticlePage.OnHTML("html", func(e *colly.HTMLElement) { + + url := e.Request.URL.String() + + title := "" + e.ForEachWithBreak("title", func(_ int, el *colly.HTMLElement) bool { + title = el.Text + return false + }) + + text := "" + + e.ForEach("div#text, p.lead", func(_ int, el *colly.HTMLElement) { + text += extractJustText(el.DOM) + }) + + article := model.ScrapedArticle{} + + trimmedText := strings.TrimSpace(text) + article.OriginalUrl = url + article.Title = title + article.Content = trimmedText + article.SourceId = model.KlixSource + slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title) + article.Slug = slug.Make(slugBase) + + KlixArticles <- article + }) + + crArticlePage.OnError(func(_ *colly.Response, _ error) { + fmt.Println("Problem crawling!") + }) + +} diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go new file mode 100644 index 0000000..deaa6a2 --- /dev/null +++ b/internal/scraper/scraper.go @@ -0,0 +1,26 @@ +package scraper + +import ( + "github.com/PuerkitoBio/goquery" +) + +func extractJustText(el *goquery.Selection) string { + textPart := "" + htmlPart, _ := el.Html() + if len(el.Nodes) == 0 { + return "" + } + + //fmt.Println("Checking: ", htmlPart, "Duzina: ", strconv.Itoa(len(el.Nodes)), " Type je ", el.Nodes[0].Type, " jednakost ", el.Text() == htmlPart) + if el.Text() == htmlPart { + return el.Text() + "\n" + } + + el.Children().Each(func(_ int, el2 *goquery.Selection) { + if el2.Is("div, p, span, a") { + textPart += extractJustText(el2) + } + }) + + return textPart +} diff --git a/internal/scraper/srpskainfo.go b/internal/scraper/srpskainfo.go new file mode 100644 index 0000000..71bec8f --- /dev/null +++ b/internal/scraper/srpskainfo.go @@ -0,0 +1,103 @@ +package scraper + +import ( + "fmt" + "github.com/gocolly/colly" + "github.com/gosimple/slug" + "gitlab.com/kbr4/svevijesti/internal/model" + "math/rand" + "regexp" + "strings" + "time" +) + +var SrpskainfoArticles = make(chan model.ScrapedArticle) +var SrpskainfoCandidates = make(chan string) +var SrpskainfoApprovedSites = make(chan string, 2) + +func CrawlSrpskainfo() { + + crHomePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com")) + crArticlePage := colly.NewCollector(colly.AllowedDomains("srpskainfo.com")) + + setupSiArticlePageCrawler(crArticlePage) + setupSiHomepageCrawler(crHomePage, crArticlePage) + + go visitSiApprovedPages(crArticlePage) +} + +func visitSiApprovedPages(crArticlePage *colly.Collector) { + fmt.Println("Consuming sites!") + for url := range SrpskainfoApprovedSites { + fmt.Println("Visiting: ", url) + crArticlePage.Visit(url) + } +} + +func setupSiHomepageCrawler(crHomePage *colly.Collector, crArticlePage *colly.Collector) { + + crHomePage.OnHTML("a", func(e *colly.HTMLElement) { + articleUrlR, _ := regexp.Compile("([A-Za-z0-9]+-){3,}([A-Za-z0-9]+)/$") + url := e.Attr("href") + completeUrl := url + if articleUrlR.MatchString(url) { + SrpskainfoCandidates <- completeUrl + } + }) + + crHomePage.OnScraped(func(_ *colly.Response) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + SrpskainfoArticles <- terminating + SrpskainfoApprovedSites <- model.Terminator + SrpskainfoCandidates <- model.Terminator + }) + + crHomePage.OnError(func(_ *colly.Response, _ error) { + time.Sleep(5 * time.Second) + terminating := model.ScrapedArticle{} + terminating.Title = model.Terminator + SrpskainfoArticles <- terminating + SrpskainfoApprovedSites <- model.Terminator + SrpskainfoCandidates <- model.Terminator + }) + + go crHomePage.Visit("https://srpskainfo.com") +} + +func setupSiArticlePageCrawler(crArticlePage *colly.Collector) { + crArticlePage.OnHTML("html", func(e *colly.HTMLElement) { + + url := e.Request.URL.String() + + title := "" + e.ForEachWithBreak("h1", func(_ int, el *colly.HTMLElement) bool { + title = el.Text + return false + }) + + text := "" + + e.ForEach("div.article__top-content, div.article__content", func(_ int, el *colly.HTMLElement) { + text += extractJustText(el.DOM) + }) + + article := model.ScrapedArticle{} + + trimmedText := strings.TrimSpace(text) + article.OriginalUrl = url + article.Title = title + article.Content = trimmedText + article.SourceId = model.SrpskainfoSource + slugBase := fmt.Sprintf("%d %d %s", article.SourceId, rand.Intn(1000), title) + article.Slug = slug.Make(slugBase) + + SrpskainfoArticles <- article + }) + + crArticlePage.OnError(func(_ *colly.Response, _ error) { + fmt.Println("Problem crawling!") + }) + +} diff --git a/internal/server/articles.go b/internal/server/articles.go new file mode 100644 index 0000000..d64ffab --- /dev/null +++ b/internal/server/articles.go @@ -0,0 +1,110 @@ +package server + +import ( + "fmt" + "github.com/gorilla/mux" + "gitlab.com/kbr4/svevijesti/internal/database" + "net/http" + "strconv" + "time" +) + +func rootHandler(wr http.ResponseWriter, req *http.Request) { + title := "Pocetna" + store, err := database.Connect() + if err != nil { + http.Error(wr, err.Error(), http.StatusInternalServerError) + } + defer store.Close() + + articles, err := database.ArticlesForDay(store, time.Now()) + if err != nil { + http.Error(wr, err.Error(), http.StatusInternalServerError) + } + + dayBefore := "/dan/" + time.Now().Add(-24*time.Hour).Format("2006-01-02") + + data := map[string]interface{}{ + "title": title, + "articles": articles, + "previous": dayBefore, + "next": "/", + } + + err = templates.ExecuteTemplate(wr, "homeHTML", data) + if err != nil { + http.Error(wr, err.Error(), http.StatusInternalServerError) + } +} + +func dailyArticlesHandler(wr http.ResponseWriter, req *http.Request) { + vars := mux.Vars(req) + day, err := time.Parse("2006-01-02", vars["date"]) + if err != nil { + http.Error(wr, err.Error(), http.StatusNotFound) + } + dayBefore := "/dan/" + day.Add(-24*time.Hour).Format("2006-01-02") + dayAfter := "/dan/" + day.Add(24*time.Hour).Format("2006-01-02") + + if day.Add(24*time.Hour).Format("2006-01-02") == time.Now().Format("2006-01-02") { + dayAfter = "/" + } + + title := fmt.Sprintf("Stare novine na dan %s", day.Format("2006-01-02")) + store, err := database.Connect() + if err != nil { + http.Error(wr, err.Error(), http.StatusInternalServerError) + } + defer store.Close() + + articles, err := database.ArticlesForDay(store, day) + if err != nil { + http.Error(wr, err.Error(), http.StatusInternalServerError) + } + + data := map[string]interface{}{ + "title": title, + "articles": articles, + "previous": dayBefore, + "next": dayAfter, + } + + err = templates.ExecuteTemplate(wr, "homeHTML", data) + if err != nil { + http.Error(wr, err.Error(), http.StatusInternalServerError) + } +} + +func articleHandler(wr http.ResponseWriter, req *http.Request) { + store, err := database.Connect() + if err != nil { + http.Error(wr, err.Error(), http.StatusInternalServerError) + } + defer store.Close() + + vars := mux.Vars(req) + articleID, err := strconv.Atoi(vars["id"]) + if err != nil { + articleID = -1 + } + articleSlug := vars["slug"] + article, err := database.ArticleByID(store, articleID, articleSlug) + if err != nil { + http.Error(wr, err.Error(), http.StatusNotFound) + } + + next, previous, _ := database.PreviousAndNextArticleUrlByID(store, articleID) + + title := article.Title + data := map[string]interface{}{ + "title": title, + "article": article, + "previous": previous, + "next": next, + } + + err = templates.ExecuteTemplate(wr, "articleHTML", data) + if err != nil { + http.Error(wr, err.Error(), http.StatusInternalServerError) + } +} diff --git a/internal/server/server.go b/internal/server/server.go new file mode 100644 index 0000000..9d7b07b --- /dev/null +++ b/internal/server/server.go @@ -0,0 +1,46 @@ +package server + +import ( + "fmt" + "github.com/gorilla/mux" + "html/template" + "io/ioutil" + "path/filepath" + "strings" +) + +var tPath = "./web/tpl/" +var dPath = "./web/data/" + +var templateDirs = []string{"./web/tpl", "./web/data"} +var templates *template.Template + +func getTemplates() (templates *template.Template, err error) { + var allFiles []string + for _, dir := range templateDirs { + files2, _ := ioutil.ReadDir(dir) + for _, file := range files2 { + filename := file.Name() + if strings.HasSuffix(filename, ".html") { + filePath := filepath.Join(dir, filename) + fmt.Println("Template found: ", filePath) + allFiles = append(allFiles, filePath) + } + } + } + + templates, err = template.New("").ParseFiles(allFiles...) + return +} + +func init() { + templates, _ = getTemplates() +} + +func CreateRoutes() *mux.Router { + r := mux.NewRouter() + r.HandleFunc("/dan/{date}", dailyArticlesHandler) + r.HandleFunc("/{id:[0-9]+}/{slug}", articleHandler) + r.HandleFunc("/", rootHandler) + return r +} diff --git a/pyth/avaz.py b/pyth/avaz.py new file mode 100644 index 0000000..898b5ea --- /dev/null +++ b/pyth/avaz.py @@ -0,0 +1,27 @@ +import requests +from bs4 import BeautifulSoup + +def getNews(url): + response = requests.get(url) + + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + + articles = soup.find_all('article', class_='news__item') + + for index, article in enumerate(articles, start=1): + title = article.find('h2').text.strip() + content = article.find('p').text.strip() + category = article.find('span').text.strip() + + print(f"{index}. Title: {title}") + print(f" Content: {content}") + print(f" Category: {category}") + print('****************************') + else: + print(f"Error. Status code: {response.status_code}") + +if __name__ == "__main__": + pUrl = 'https://srpskainfo.com/sve-vijesti/' + + getNews(pUrl) diff --git a/pyth/checkforsimilar.py b/pyth/checkforsimilar.py new file mode 100644 index 0000000..7c3875d --- /dev/null +++ b/pyth/checkforsimilar.py @@ -0,0 +1,74 @@ +import psycopg2 +from openai import OpenAI +from datetime import datetime, timedelta, timezone + + +client = OpenAI(api_key='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') +import spacy + + +nlp = spacy.load("en_core_web_sm") +twenty_minutes_ago_utc = datetime.now(timezone.utc) - timedelta(minutes=20) + + + +db_params = { + 'host': 'localhost', + 'port': '5432', + 'database': 'svevijestiweb', + 'user': 'svevijesti', + 'password': 'salmonela pljusti 221 hamo' +} + + +conn = psycopg2.connect(**db_params) +cursor = conn.cursor() + +def convert_text_to_vector(text): + return nlp(text).vector + +def check_similarity_with_gpt3(text1, text2): + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "You are a text similarity assistant."}, + {"role": "user", "content": f"Compare the similarity between the following two texts:\n\nText 1: {text1}\nText 2: {text2}\n\nSimilarity:"} + ] + ) + similarity_score = completion.choices[0].message.content + print("Analiza") + return similarity_score + +cursor.execute("SELECT title FROM articles WHERE articles.created_at < %s", (twenty_minutes_ago_utc,)) +data_from_database = cursor.fetchall() + + +for i in range(len(data_from_database)): + for j in range(i + 1, len(data_from_database)): + text1 = data_from_database[i][0] + text2 = data_from_database[j][0] + + vector1 = convert_text_to_vector(text1) + vector2 = convert_text_to_vector(text2) + + similarity_score = check_similarity_with_gpt3(vector1, vector2 ) + print(similarity_score) + print("T1",text1) + print("T2", text2) + + + similarity_threshold = 0.8 + + if similarity_score > similarity_threshold: + try: + cursor.execute("DELETE FROM articles WHERE content = %s", (text2,)) + conn.commit() + print(f"Deleted rows where title is {text2}") + except Exception as e: + conn.rollback() # Roll back changes if an error occurs + print(f"Error deleting rows: {e}") + + + +cursor.close() +conn.close() diff --git a/pyth/srpskainfo.py b/pyth/srpskainfo.py new file mode 100644 index 0000000..88c73d9 --- /dev/null +++ b/pyth/srpskainfo.py @@ -0,0 +1,34 @@ +import requests +from bs4 import BeautifulSoup + +def getNews(url): + response = requests.get(url) + + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + + articles = soup.find_all('article', class_='news__item') + + for index, article in enumerate(articles, start=1): + title = article.find('h2').text.strip() + content = article.find('p').text.strip() + category = article.find('span').text.strip() + slink = article.find('a') + if slink: + slink = slink.get('href', '') + else: + slink = '' + + + print(f"{index}. Title: {title}") + print(f" Content: {content}") + print(f" Category: {category}") + print(f"Link: {slink}") + print('****************************') + else: + print(f"Error. Status code: {response.status_code}") + +if __name__ == "__main__": + pUrl = 'https://srpskainfo.com/sve-vijesti/' + + getNews(pUrl) diff --git a/scripts/install_server.sh b/scripts/install_server.sh new file mode 100644 index 0000000..33807c2 --- /dev/null +++ b/scripts/install_server.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +sudo systemctl stop starenovine +sudo cp ./server /opt/starenovine/server +sudo cp -R ./web /opt/starenovine/ +sudo killall spider +sudo cp ./spider /opt/starenovine/spider +sudo systemctl start starenovine diff --git a/server b/server new file mode 100755 index 0000000..df4e93d Binary files /dev/null and b/server differ diff --git a/spider b/spider new file mode 100755 index 0000000..714b07c Binary files /dev/null and b/spider differ diff --git a/web/data/articles.html b/web/data/articles.html new file mode 100644 index 0000000..66c0d3c --- /dev/null +++ b/web/data/articles.html @@ -0,0 +1,16 @@ +{{define "articlesHTML"}} +
    +{{range .articles}} +
  1. +
    + + {{.Title}}
    +
    {{.SourceName}} - {{ .FormatedCreatedAt }}
    +
  2. +

    +{{else}} +Nema članaka za izabrani datum. +{{end}} + +
+{{end}} diff --git a/web/data/footer.html b/web/data/footer.html new file mode 100644 index 0000000..b5b225e --- /dev/null +++ b/web/data/footer.html @@ -0,0 +1,13 @@ +{{define "footerHTML"}} + + +{{end}} diff --git a/web/data/head.html b/web/data/head.html new file mode 100644 index 0000000..1d63ae5 --- /dev/null +++ b/web/data/head.html @@ -0,0 +1,56 @@ +{{define "headHTML"}} + + + + + + + + + + + + {{.title}} - stare novine + + + +{{end}} diff --git a/web/data/header.html b/web/data/header.html new file mode 100644 index 0000000..3da02e7 --- /dev/null +++ b/web/data/header.html @@ -0,0 +1,20 @@ +{{define "headerHTML"}} +
+ +
+ +
+{{end}} diff --git a/web/data/single_article.html b/web/data/single_article.html new file mode 100644 index 0000000..5c30b30 --- /dev/null +++ b/web/data/single_article.html @@ -0,0 +1,11 @@ +{{define "singleArticleHTML"}} +{{with .article }} +
+

{{.Title}}

+
{{.SourceName}} - {{ .FormatedCreatedAt }}
+
+ {{.Content}} +
+

+{{end}} +{{end}} diff --git a/web/tpl/article.html b/web/tpl/article.html new file mode 100644 index 0000000..6400b9e --- /dev/null +++ b/web/tpl/article.html @@ -0,0 +1,12 @@ +{{define "articleHTML"}} + +{{template "headHTML" .}} + +{{template "headerHTML" .}} + +{{template "singleArticleHTML" .}} + +{{template "footerHTML" .}} + + +{{end}} diff --git a/web/tpl/dailyArticles.html b/web/tpl/dailyArticles.html new file mode 100644 index 0000000..849ed2f --- /dev/null +++ b/web/tpl/dailyArticles.html @@ -0,0 +1,12 @@ +{{define "homeHTML"}} + +{{template "headHTML" .}} + +{{template "headerHTML" .}} + +{{template "articlesHTML" .}} + +{{template "footerHTML" .}} + + +{{end}} diff --git a/web/tpl/home.html b/web/tpl/home.html new file mode 100644 index 0000000..849ed2f --- /dev/null +++ b/web/tpl/home.html @@ -0,0 +1,12 @@ +{{define "homeHTML"}} + +{{template "headHTML" .}} + +{{template "headerHTML" .}} + +{{template "articlesHTML" .}} + +{{template "footerHTML" .}} + + +{{end}}