Debug
This commit is contained in:
@@ -59,16 +59,17 @@ async function crawlAll() {
|
|||||||
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
||||||
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
||||||
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
||||||
),
|
|
||||||
new SaljicCrawler(
|
|
||||||
[postgresSaver],
|
|
||||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
|
||||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
|
||||||
SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
|
||||||
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
|
||||||
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
|
||||||
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
|
||||||
)
|
)
|
||||||
|
//,
|
||||||
|
//new SaljicCrawler(
|
||||||
|
//[postgresSaver],
|
||||||
|
//SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
||||||
|
//SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
||||||
|
//SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
||||||
|
//SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
||||||
|
//SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
||||||
|
//SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
||||||
|
//)
|
||||||
];
|
];
|
||||||
|
|
||||||
const newRealEstates = [];
|
const newRealEstates = [];
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const fetch = require("../../helpers/fetchWrapper");
|
const fetch = require("../../helpers/fetchWrapper");
|
||||||
|
const { logDebug } = require("../../helpers/log");
|
||||||
const cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
const Promise = require("bluebird");
|
const Promise = require("bluebird");
|
||||||
const moment = require("moment-timezone");
|
const moment = require("moment-timezone");
|
||||||
@@ -45,6 +46,16 @@ const OLX_ENUMS = {
|
|||||||
|
|
||||||
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
||||||
|
|
||||||
|
/**
 * Splits an array into consecutive slices of at most `size` elements.
 *
 * @param {Array} array - Items to partition; the input is not mutated.
 * @param {number} [size=10] - Maximum length of each slice; must be a positive integer.
 * @returns {Array<Array>} Slices in original order; the last one may be shorter.
 *   An empty input yields an empty array.
 * @throws {RangeError} If `size` is not a positive integer.
 */
const chunk = (array, size = 10) => {
  // A zero or negative step never moves the index past the end of the array,
  // so without this guard the loop below would never terminate.
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }

  const result = [];
  for (let start = 0; start < array.length; start += size) {
    result.push(array.slice(start, start + size));
  }
  return result;
};
|
||||||
|
|
||||||
class OlxCrawler {
|
class OlxCrawler {
|
||||||
constructor(
|
constructor(
|
||||||
savers = [],
|
savers = [],
|
||||||
@@ -66,6 +77,7 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
|
logDebug("Starting OLX crawl");
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
const crawlAdCategories = this.crawlerAdCategories;
|
||||||
|
|
||||||
const newRealEstates = [];
|
const newRealEstates = [];
|
||||||
@@ -227,19 +239,21 @@ class OlxCrawler {
|
|||||||
asyncScraping.push(hrefs[i]);
|
asyncScraping.push(hrefs[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const allChunks = chunk(asyncScraping, 2);
|
||||||
const dataResults = []
|
const dataResults = []
|
||||||
const { scrapedData, errors } = await PromisePool
|
for (let i = 0; i < allChunks.length; i++) {
|
||||||
.withConcurrency(2)
|
const singleChunk = allChunks[i];
|
||||||
.for(asyncScraping)
|
const promises = singleChunk.map(c => this.scrapeAd(c))
|
||||||
.process(async data => {
|
const chunkResults = await Promise.all(promises);
|
||||||
const result = await this.scrapeAd(data)
|
await this.sleep(this.delayBetweenPages);
|
||||||
await this.sleep(this.delayBetweenPages);
|
dataResults.push(...chunkResults);
|
||||||
dataResults.push(result)
|
logDebug("Chunk results len:", chunkResults.length);
|
||||||
return result; //TODO: this does not work, scrapedData is null, dataResults works
|
}
|
||||||
})
|
|
||||||
|
|
||||||
const filteredScrapedData = dataResults.filter(adData => !!adData);
|
const filteredScrapedData = dataResults.filter(adData => !!adData);
|
||||||
|
logDebug("Filtered scraped data length: ", filteredScrapedData.length);
|
||||||
|
|
||||||
return filteredScrapedData;
|
return filteredScrapedData;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("Exception caught:" + e);
|
console.error("Exception caught:" + e);
|
||||||
@@ -248,7 +262,7 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async scrapeAd(url) {
|
async scrapeAd(url) {
|
||||||
//console.log("Scraping : ", url);
|
logDebug("Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url);
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
|
|||||||
13
app/helpers/log.js
Normal file
13
app/helpers/log.js
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
const {
|
||||||
|
PRINT_CRAWLER_DEBUG
|
||||||
|
} = require("../config/appConfig");
|
||||||
|
|
||||||
|
/**
 * Debug logger for the crawlers: forwards every argument to `console.log`,
 * but only when the `PRINT_CRAWLER_DEBUG` setting is truthy. Otherwise it
 * does nothing.
 *
 * @param {...*} args - Values passed through to `console.log` unchanged.
 * @returns {void}
 */
const logDebug = (...args) => {
  // Guard clause: stay silent unless debug output has been switched on.
  if (!PRINT_CRAWLER_DEBUG) {
    return;
  }
  console.log(...args);
};
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
logDebug
|
||||||
|
};
|
||||||
3
index.js
3
index.js
@@ -4,6 +4,7 @@ const bodyParser = require("body-parser");
|
|||||||
const layout = require("express-layout");
|
const layout = require("express-layout");
|
||||||
const compression = require("compression");
|
const compression = require("compression");
|
||||||
const forceSSL = require("./app/helpers/forceSSL");
|
const forceSSL = require("./app/helpers/forceSSL");
|
||||||
|
const { logDebug } = require("./app/helpers/log");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
APP_PORT,
|
APP_PORT,
|
||||||
@@ -38,9 +39,11 @@ app.listen(APP_PORT, () =>
|
|||||||
|
|
||||||
let crawlerRunning = STOP_CRAWLER;
|
let crawlerRunning = STOP_CRAWLER;
|
||||||
const crawl = () => {
|
const crawl = () => {
|
||||||
|
logDebug("Crawl start");
|
||||||
if (!crawlerRunning) {
|
if (!crawlerRunning) {
|
||||||
crawlerRunning = true;
|
crawlerRunning = true;
|
||||||
crawlAll().then(newRealEstates => {
|
crawlAll().then(newRealEstates => {
|
||||||
|
logDebug("crawlAll done, new real estate len: ", newRealEstates.length);
|
||||||
crawlerRunning = false;
|
crawlerRunning = false;
|
||||||
notifyForNewRealEstates(newRealEstates);
|
notifyForNewRealEstates(newRealEstates);
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user