Debug
This commit is contained in:
@@ -59,16 +59,17 @@ async function crawlAll() {
|
||||
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
||||
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
||||
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
||||
),
|
||||
new SaljicCrawler(
|
||||
[postgresSaver],
|
||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
||||
SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
||||
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
||||
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
||||
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
||||
)
|
||||
//,
|
||||
//new SaljicCrawler(
|
||||
//[postgresSaver],
|
||||
//SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
||||
//SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
||||
//SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
||||
//SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
||||
//SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
||||
//SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
||||
//)
|
||||
];
|
||||
|
||||
const newRealEstates = [];
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"use strict";
|
||||
|
||||
const fetch = require("../../helpers/fetchWrapper");
|
||||
const { logDebug } = require("../../helpers/log");
|
||||
const cheerio = require("cheerio");
|
||||
const Promise = require("bluebird");
|
||||
const moment = require("moment-timezone");
|
||||
@@ -45,6 +46,16 @@ const OLX_ENUMS = {
|
||||
|
||||
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
||||
|
||||
/**
 * Splits an array into consecutive sub-arrays of at most `size` elements.
 *
 * Chunks keep the original element order; the final chunk holds the remainder
 * and may be shorter than `size`. The input array is not modified.
 *
 * @param {Array} array - Items to partition.
 * @param {number} [size=10] - Maximum chunk length; must be a positive integer.
 * @returns {Array<Array>} The chunks, or an empty array when `array` is empty.
 * @throws {RangeError} If `size` is not a positive integer.
 */
const chunk = (array, size = 10) => {
  // A zero or negative step never advances the loop below (it would spin
  // forever while growing `result`), and a fractional step yields empty or
  // uneven slices, so reject anything that is not a positive integer.
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(
      `chunk size must be a positive integer, got ${String(size)}`
    );
  }

  const result = [];
  for (let start = 0; start < array.length; start += size) {
    result.push(array.slice(start, start + size));
  }
  return result;
};
|
||||
|
||||
class OlxCrawler {
|
||||
constructor(
|
||||
savers = [],
|
||||
@@ -66,6 +77,7 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
logDebug("Starting OLX crawl");
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
const newRealEstates = [];
|
||||
@@ -227,19 +239,21 @@ class OlxCrawler {
|
||||
asyncScraping.push(hrefs[i]);
|
||||
}
|
||||
|
||||
|
||||
const allChunks = chunk(asyncScraping, 2);
|
||||
const dataResults = []
|
||||
const { scrapedData, errors } = await PromisePool
|
||||
.withConcurrency(2)
|
||||
.for(asyncScraping)
|
||||
.process(async data => {
|
||||
const result = await this.scrapeAd(data)
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
dataResults.push(result)
|
||||
return result; //TODO: this does not work, scrapedData is null, dataResults works
|
||||
})
|
||||
for (let i = 0; i < allChunks.length; i++) {
|
||||
const singleChunk = allChunks[i];
|
||||
const promises = singleChunk.map(c => this.scrapeAd(c))
|
||||
const chunkResults = await Promise.all(promises);
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
dataResults.push(...chunkResults);
|
||||
logDebug("Chunk results len:", chunkResults.length);
|
||||
}
|
||||
|
||||
|
||||
const filteredScrapedData = dataResults.filter(adData => !!adData);
|
||||
logDebug("Filtered scraped data length: ", filteredScrapedData.length);
|
||||
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
console.error("Exception caught:" + e);
|
||||
@@ -248,7 +262,7 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async scrapeAd(url) {
|
||||
//console.log("Scraping : ", url);
|
||||
logDebug("Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
|
||||
13
app/helpers/log.js
Normal file
13
app/helpers/log.js
Normal file
@@ -0,0 +1,13 @@
|
||||
const {
|
||||
PRINT_CRAWLER_DEBUG
|
||||
} = require("../config/appConfig");
|
||||
|
||||
/**
 * Writes the given values to the console, but only when the
 * PRINT_CRAWLER_DEBUG configuration flag is truthy; otherwise does nothing.
 *
 * @param {...*} args - Values forwarded unchanged to `console.log`.
 * @returns {void}
 */
const logDebug = (...args) => {
  // Debug output disabled: stay silent.
  if (!PRINT_CRAWLER_DEBUG) {
    return;
  }

  console.log(...args);
};
|
||||
|
||||
module.exports = {
|
||||
logDebug
|
||||
};
|
||||
3
index.js
3
index.js
@@ -4,6 +4,7 @@ const bodyParser = require("body-parser");
|
||||
const layout = require("express-layout");
|
||||
const compression = require("compression");
|
||||
const forceSSL = require("./app/helpers/forceSSL");
|
||||
const { logDebug } = require("./app/helpers/log");
|
||||
|
||||
const {
|
||||
APP_PORT,
|
||||
@@ -38,9 +39,11 @@ app.listen(APP_PORT, () =>
|
||||
|
||||
let crawlerRunning = STOP_CRAWLER;
|
||||
const crawl = () => {
|
||||
logDebug("Crawl start");
|
||||
if (!crawlerRunning) {
|
||||
crawlerRunning = true;
|
||||
crawlAll().then(newRealEstates => {
|
||||
logDebug("crawlAll done, new real estate len: ", newRealEstates.length);
|
||||
crawlerRunning = false;
|
||||
notifyForNewRealEstates(newRealEstates);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user