Merge branch 'implement-worker-client' into 'master'
Implement worker client See merge request saburly/kiviscraplib!1
This commit was merged in pull request #1.
This commit is contained in:
94
config/config.go
Normal file
94
config/config.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"github.com/joho/godotenv"
|
||||
"gitlab.com/saburly/kiviscraplib/structures"
|
||||
"log"
|
||||
"os"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
var WebServerConfig structures.WebServerConfig
|
||||
var WorkerServerConfig structures.WorkerServerConfig
|
||||
var ClientConfig structures.ClientConfig
|
||||
|
||||
var defaultClientConfigValues = make(map[string]string)
|
||||
var defaultServerConfigValues = make(map[string]string)
|
||||
|
||||
func InitServerConfig() {
|
||||
loadEnvVariables()
|
||||
|
||||
initServerConfigDefaultValues()
|
||||
generateServerConfigObject()
|
||||
}
|
||||
|
||||
func InitClientConfig() {
|
||||
loadEnvVariables()
|
||||
|
||||
initClientConfigDefaultValues()
|
||||
generateClientConfigObject()
|
||||
}
|
||||
|
||||
func loadEnvVariables() {
|
||||
err := godotenv.Load()
|
||||
if err != nil {
|
||||
log.Fatal("Unable to load ENV variables")
|
||||
}
|
||||
}
|
||||
|
||||
func generateClientConfigObject() {
|
||||
ClientConfig.ConnectionsCount = getInt("CLIENT_CONNECTIONS_COUNT")
|
||||
ClientConfig.ConnectionTimeout = getInt("CLIENT_CONNECTION_TIMEOUT")
|
||||
ClientConfig.WorkerServerAddress = getString("WORKER_SERVER_ADDRESS")
|
||||
ClientConfig.RequestMessagePrefix = getString("REQUEST_MESSAGE_PREFIX")
|
||||
ClientConfig.ProxyListBaseURL = getString("PROXY_LIST_BASE_URL")
|
||||
}
|
||||
|
||||
func generateServerConfigObject() {
|
||||
WebServerConfig.Address = getString("WEB_SERVER_ADDRESS")
|
||||
WebServerConfig.APIKey = getString("WEB_SERVER_API_KEY")
|
||||
WebServerConfig.Timeout = getInt("WEB_SERVER_TIMEOUT")
|
||||
|
||||
WorkerServerConfig.Address = getString("WORKER_SERVER_ADDRESS")
|
||||
WorkerServerConfig.WorkersCount = getInt("WORKER_SERVER_WORKERS_COUNT")
|
||||
WorkerServerConfig.RequestMessagePrefix = getString("WORKER_SERVER_REQUEST_MESSAGE_PREFIX")
|
||||
}
|
||||
|
||||
func initClientConfigDefaultValues() {
|
||||
defaultClientConfigValues["CLIENT_CONNECTIONS_COUNT"] = "5"
|
||||
defaultClientConfigValues["CLIENT_CONNECTION_TIMEOUT"] = "2"
|
||||
defaultClientConfigValues["WORKER_SERVER_ADDRESS"] = "127.0.0.1:1338"
|
||||
defaultClientConfigValues["REQUEST_MESSAGE_PREFIX"] = "URL "
|
||||
defaultClientConfigValues["PROXY_LIST_BASE_URL"] = "https://www.proxy-list.download/api/v1/get?type="
|
||||
}
|
||||
|
||||
func initServerConfigDefaultValues() {
|
||||
defaultServerConfigValues["WEB_SERVER_ADDRESS"] = "127.0.0.1:1337"
|
||||
defaultServerConfigValues["WEB_SERVER_API_KEY"] = "b8be5a3b639d465039e8fbe7582270a7"
|
||||
defaultServerConfigValues["WEB_SERVER_TIMEOUT"] = "100"
|
||||
|
||||
defaultServerConfigValues["WORKER_SERVER_ADDRESS"] = "127.0.0.1:1338"
|
||||
defaultServerConfigValues["WORKER_SERVER_WORKERS_COUNT"] = "50"
|
||||
defaultServerConfigValues["WORKER_SERVER_REQUEST_MESSAGE_PREFIX"] = "URL "
|
||||
}
|
||||
|
||||
func getInt(key string) int {
|
||||
value := os.Getenv(key)
|
||||
if len(value) == 0 {
|
||||
value = defaultClientConfigValues[key]
|
||||
}
|
||||
|
||||
numericalValue, err := strconv.Atoi(value)
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot convert ENV value for %s to the integer", key)
|
||||
}
|
||||
return numericalValue
|
||||
}
|
||||
|
||||
func getString(key string) string {
|
||||
value := os.Getenv(key)
|
||||
if len(value) == 0 {
|
||||
return defaultClientConfigValues[key]
|
||||
}
|
||||
return value
|
||||
}
|
||||
@@ -15,3 +15,28 @@ type WorkerDescription struct {
|
||||
Ip string
|
||||
Req chan Request
|
||||
}
|
||||
|
||||
type ProxyServer struct {
|
||||
Type string
|
||||
Address string
|
||||
}
|
||||
|
||||
type WebServerConfig struct {
|
||||
Address string
|
||||
APIKey string
|
||||
Timeout int // In seconds
|
||||
}
|
||||
|
||||
type WorkerServerConfig struct {
|
||||
Address string
|
||||
WorkersCount int
|
||||
RequestMessagePrefix string
|
||||
}
|
||||
|
||||
type ClientConfig struct {
|
||||
ConnectionsCount int
|
||||
ConnectionTimeout int // In seconds
|
||||
WorkerServerAddress string
|
||||
RequestMessagePrefix string
|
||||
ProxyListBaseURL string
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package webserver
|
||||
|
||||
import (
|
||||
c "gitlab.com/saburly/kiviscraplib/config"
|
||||
"gitlab.com/saburly/kiviscraplib/structures"
|
||||
"gitlab.com/saburly/kiviscraplib/utils"
|
||||
"log"
|
||||
@@ -13,7 +14,7 @@ var requests chan structures.Request
|
||||
func ServeHTTP(queue chan structures.Request, end chan<- string) {
|
||||
requests = queue
|
||||
http.HandleFunc("/", httpHandler)
|
||||
err := http.ListenAndServe("127.0.0.1:1337", nil)
|
||||
err := http.ListenAndServe(c.WebServerConfig.Address, nil)
|
||||
if err != nil {
|
||||
end <- "http server"
|
||||
log.Fatal(err)
|
||||
@@ -21,7 +22,6 @@ func ServeHTTP(queue chan structures.Request, end chan<- string) {
|
||||
}
|
||||
|
||||
func httpHandler(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
query := r.URL.Query()
|
||||
api_key, present := query["api_key"]
|
||||
if !present || len(api_key) == 0 {
|
||||
@@ -30,7 +30,7 @@ func httpHandler(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
// todo: do more for the authentication / authorization
|
||||
if api_key[0] != "b8be5a3b639d465039e8fbe7582270a7" {
|
||||
if api_key[0] != c.WebServerConfig.APIKey {
|
||||
respondWithError(w, 401, "api_key is wrong")
|
||||
return
|
||||
}
|
||||
@@ -56,10 +56,9 @@ func httpHandler(w http.ResponseWriter, r *http.Request) {
|
||||
log.Printf("request from %s: %s %q", r.RemoteAddr, r.Method, r.URL)
|
||||
log.Printf("waiting for response")
|
||||
|
||||
// todo: put timeout in ENV variable
|
||||
timeout := make(chan bool, 1)
|
||||
go func() {
|
||||
time.Sleep(100 * time.Second)
|
||||
time.Sleep(time.Duration(c.WebServerConfig.Timeout) * time.Second)
|
||||
timeout <- true
|
||||
}()
|
||||
|
||||
|
||||
209
workerclient/workerclient.go
Normal file
209
workerclient/workerclient.go
Normal file
@@ -0,0 +1,209 @@
|
||||
package workerclient
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
b64 "encoding/base64"
|
||||
c "gitlab.com/saburly/kiviscraplib/config"
|
||||
"gitlab.com/saburly/kiviscraplib/structures"
|
||||
"golang.org/x/net/proxy"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"math/rand"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func StartClientConnections() {
|
||||
rand.Seed(time.Now().Unix())
|
||||
|
||||
numberOfConnections := c.ClientConfig.ConnectionsCount
|
||||
|
||||
for i := 0; i < numberOfConnections; i++ {
|
||||
go startSingleConnection(i)
|
||||
}
|
||||
|
||||
// TODO: Make one initial global proxy list load
|
||||
// TODO: Make goroutine for periodic proxy reload and health check
|
||||
}
|
||||
|
||||
// startSingleConnection starts one single connection waiting for request message from load balancer server
|
||||
func startSingleConnection(connectionId int) {
|
||||
log.Printf("(%d) Starting new client connection\n", connectionId)
|
||||
|
||||
connectionTimeout := c.ClientConfig.ConnectionTimeout
|
||||
|
||||
for {
|
||||
serverAddress := c.ClientConfig.WorkerServerAddress
|
||||
conn, err := net.Dial("tcp", serverAddress)
|
||||
if err != nil {
|
||||
log.Printf("(%d) Cannot connect to the load balancer server on %s : %s", connectionId, serverAddress, err)
|
||||
time.Sleep(time.Duration(connectionTimeout) * time.Second)
|
||||
continue
|
||||
}
|
||||
|
||||
for {
|
||||
encodedRequestMessage, err := bufio.NewReader(conn).ReadString('\n')
|
||||
if err != nil {
|
||||
log.Printf("(%d) Error receiving request from load balancer server : %s\n", connectionId, err)
|
||||
_ = conn.Close()
|
||||
time.Sleep(time.Duration(connectionTimeout) * time.Second)
|
||||
break
|
||||
}
|
||||
|
||||
requestMessageBytes, _ := b64.StdEncoding.DecodeString(strings.TrimSpace(encodedRequestMessage))
|
||||
requestMessage := string(requestMessageBytes)
|
||||
|
||||
log.Printf("(%d) Received new request message : %s", connectionId, requestMessage)
|
||||
|
||||
requestMessagePrefix := c.ClientConfig.RequestMessagePrefix
|
||||
if !strings.HasPrefix(requestMessage, requestMessagePrefix) {
|
||||
log.Printf("(%d) Request message has to start with %s", connectionId, requestMessagePrefix)
|
||||
_ = conn.Close()
|
||||
time.Sleep(time.Duration(connectionTimeout) * time.Second)
|
||||
break
|
||||
}
|
||||
|
||||
urlToFetch := strings.TrimSpace(strings.TrimPrefix(requestMessage, requestMessagePrefix))
|
||||
|
||||
pageBody, err := fetchPage(urlToFetch, connectionId)
|
||||
if err != nil {
|
||||
_ = conn.Close()
|
||||
time.Sleep(time.Duration(connectionTimeout) * time.Second)
|
||||
break
|
||||
}
|
||||
|
||||
pageBodyBase64 := b64.StdEncoding.EncodeToString([]byte(pageBody))
|
||||
|
||||
_, err = conn.Write([]byte(pageBodyBase64 + "\n"))
|
||||
if err != nil {
|
||||
log.Printf("(%d) Cannot write to the load balancer server : %s", connectionId, err)
|
||||
_ = conn.Close()
|
||||
time.Sleep(time.Duration(connectionTimeout) * time.Second)
|
||||
break
|
||||
}
|
||||
|
||||
log.Printf("(%d) Response sent. Waiting for a new job", connectionId)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fetchPage fetches page from desired url using random proxy
|
||||
func fetchPage(url string, connectionId int) (string, error) {
|
||||
// create request
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
log.Printf("(%d) Cannot create GET request to the %s : %s\n", connectionId, url, err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
// create httpClient to execute request
|
||||
httpClient, err := getHttpClient(connectionId)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// use the http client to fetch the page
|
||||
resp, err2 := httpClient.Do(req)
|
||||
if err2 != nil {
|
||||
log.Printf("(%d) [PROXY] Cannot GET page %s : %s\n", connectionId, url, err2)
|
||||
return "", err2
|
||||
}
|
||||
|
||||
// TODO: Handle error
|
||||
defer resp.Body.Close()
|
||||
|
||||
pageBody, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Printf("(%d) [PROXY] Cannot read body of %s : %s\n", connectionId, url, err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
return string(pageBody), nil
|
||||
}
|
||||
|
||||
// getHttpClient creates client with proxy connection; Proxy is selected randomly from list of proxies
|
||||
func getHttpClient(connectionId int) (*http.Client, error) {
|
||||
proxyList := getAllProxies()
|
||||
|
||||
// setup a http client
|
||||
httpTransport := &http.Transport{}
|
||||
httpClient := &http.Client{Transport: httpTransport}
|
||||
|
||||
if len(proxyList) == 0 {
|
||||
log.Printf("(%d) [PROXY] No proxy found, will continue without proxy!\n", connectionId)
|
||||
return httpClient, nil
|
||||
}
|
||||
|
||||
// get random proxy from the list
|
||||
selectedProxy := proxyList[rand.Intn(len(proxyList))]
|
||||
|
||||
if selectedProxy.Type == "https" {
|
||||
proxyUrl, err := url.Parse("http://" + selectedProxy.Address)
|
||||
if err != nil {
|
||||
log.Printf("(%d) [PROXY] Cannot parse proxy address (%s) %s : %s\n",
|
||||
connectionId, selectedProxy.Type, selectedProxy.Address, err)
|
||||
return nil, err
|
||||
}
|
||||
httpTransport.Proxy = http.ProxyURL(proxyUrl)
|
||||
//myClient := &http.Client{Transport: &http.Transport{Proxy: http.ProxyURL(proxyUrl)}}
|
||||
}
|
||||
|
||||
if selectedProxy.Type == "socks5" {
|
||||
dialer, err := proxy.SOCKS5("tcp", selectedProxy.Address, nil, proxy.Direct)
|
||||
if err != nil {
|
||||
log.Printf("(%d) [PROXY] Cannot create connection to the proxy (%s) %s : %s\n",
|
||||
connectionId, selectedProxy.Type, selectedProxy.Address, err)
|
||||
return nil, err
|
||||
}
|
||||
httpTransport.Dial = dialer.Dial
|
||||
}
|
||||
|
||||
log.Printf("(%d) [PROXY] Selected proxy (%s) %s\n", connectionId, selectedProxy.Type, selectedProxy.Address)
|
||||
|
||||
return httpClient, nil
|
||||
}
|
||||
|
||||
// getAllProxies combines all proxy types in one slice
|
||||
func getAllProxies() []structures.ProxyServer {
|
||||
return append(getProxiesList("socks5"), getProxiesList("https")...)
|
||||
}
|
||||
|
||||
// getProxiesList fetches list of proxies of specific type (valid types are : "socks5", "https")
|
||||
func getProxiesList(proxyType string) []structures.ProxyServer {
|
||||
switch proxyType {
|
||||
case "https":
|
||||
case "socks5":
|
||||
break
|
||||
default:
|
||||
log.Printf("%s is not a valid proxy type", proxyType)
|
||||
return []structures.ProxyServer{}
|
||||
}
|
||||
|
||||
proxyListUrl := c.ClientConfig.ProxyListBaseURL + proxyType
|
||||
|
||||
resp, err := http.Get(proxyListUrl)
|
||||
if err != nil {
|
||||
log.Printf("Cannot get list of proxies [%s] : %s", proxyType, err)
|
||||
return []structures.ProxyServer{}
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
proxyList, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Printf("Cannot read response for a list of proxies [%s] : %s", proxyType, err)
|
||||
return []structures.ProxyServer{}
|
||||
}
|
||||
|
||||
proxyAddresses := strings.Split(strings.TrimSpace(string(proxyList)), "\n")
|
||||
|
||||
var result []structures.ProxyServer
|
||||
for _, addr := range proxyAddresses {
|
||||
result = append(result, structures.ProxyServer{Type: proxyType, Address: strings.TrimSpace(addr)})
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
@@ -2,10 +2,13 @@ package workerserver
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
b64 "encoding/base64"
|
||||
c "gitlab.com/saburly/kiviscraplib/config"
|
||||
"gitlab.com/saburly/kiviscraplib/structures"
|
||||
"log"
|
||||
"math/rand"
|
||||
"net"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -13,13 +16,12 @@ var requests chan structures.Request
|
||||
var workers chan structures.WorkerDescription
|
||||
|
||||
func ServeWorkers(queue chan structures.Request, end chan<- string) {
|
||||
|
||||
rand.Seed(time.Now().Unix())
|
||||
|
||||
workers = make(chan structures.WorkerDescription, 50) // TODO: move to env var
|
||||
workers = make(chan structures.WorkerDescription, c.WorkerServerConfig.WorkersCount)
|
||||
requests = queue
|
||||
|
||||
listener, err := net.Listen("tcp", "127.0.0.1:1338")
|
||||
listener, err := net.Listen("tcp", c.WorkerServerConfig.Address)
|
||||
if err != nil {
|
||||
log.Fatal("tcp server listener error:", err)
|
||||
end <- "tcp server"
|
||||
@@ -53,11 +55,15 @@ func handleConnection(conn net.Conn) {
|
||||
Req: make(chan structures.Request),
|
||||
}
|
||||
|
||||
log.Printf("New worker client connected from %s\n", clientAddr)
|
||||
|
||||
workers <- workerDescription // add new worker to the end of the queue
|
||||
for {
|
||||
request := <-workerDescription.Req
|
||||
|
||||
_, err := conn.Write([]byte("URL " + request.Url + "\n"))
|
||||
requestBase64 := b64.StdEncoding.EncodeToString([]byte(c.WorkerServerConfig.RequestMessagePrefix + request.Url))
|
||||
|
||||
_, err := conn.Write([]byte(requestBase64 + "\n"))
|
||||
if err != nil {
|
||||
log.Println("Cannot send to " + clientAddr)
|
||||
conn.Close()
|
||||
@@ -65,7 +71,7 @@ func handleConnection(conn net.Conn) {
|
||||
return // don't return worker to the queue - not healthy
|
||||
}
|
||||
|
||||
bufferBytes, err := bufio.NewReader(conn).ReadBytes('\n')
|
||||
bufferString, err := bufio.NewReader(conn).ReadString('\n')
|
||||
if err != nil {
|
||||
log.Println("client left..")
|
||||
conn.Close()
|
||||
@@ -73,10 +79,12 @@ func handleConnection(conn net.Conn) {
|
||||
return // don't return worker to the queue - not healthy
|
||||
}
|
||||
|
||||
decodedPageBody, _ := b64.StdEncoding.DecodeString(strings.TrimSpace(bufferString))
|
||||
|
||||
log.Printf("Recieving response from %s\n", clientAddr)
|
||||
response := structures.Response{
|
||||
Url: request.Url,
|
||||
Content: bufferBytes,
|
||||
Content: decodedPageBody,
|
||||
Err: nil,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user