Debug
This commit is contained in:
@@ -59,16 +59,17 @@ async function crawlAll() {
|
||||
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
||||
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
||||
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
||||
),
|
||||
new SaljicCrawler(
|
||||
[postgresSaver],
|
||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
||||
SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
||||
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
||||
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
||||
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
||||
)
|
||||
//,
|
||||
//new SaljicCrawler(
|
||||
//[postgresSaver],
|
||||
//SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
||||
//SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
||||
//SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
||||
//SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
||||
//SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
||||
//SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
||||
//)
|
||||
];
|
||||
|
||||
const newRealEstates = [];
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"use strict";
|
||||
|
||||
const fetch = require("../../helpers/fetchWrapper");
|
||||
const { logDebug } = require("../../helpers/log");
|
||||
const cheerio = require("cheerio");
|
||||
const Promise = require("bluebird");
|
||||
const moment = require("moment-timezone");
|
||||
@@ -45,6 +46,16 @@ const OLX_ENUMS = {
|
||||
|
||||
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
||||
|
||||
/**
 * Splits an array into consecutive sub-arrays of at most `size` elements.
 *
 * The input array is not modified; every chunk is a shallow copy produced by
 * `Array.prototype.slice`. The last chunk may hold fewer than `size` elements.
 *
 * @param {Array} array - Items to partition.
 * @param {number} [size=10] - Maximum length of each chunk; must be a positive integer.
 * @returns {Array<Array>} The chunks in their original order; empty when `array` is empty.
 * @throws {RangeError} If `size` is not a positive integer.
 */
const chunk = (array, size = 10) => {
  // A zero or negative step never advances the loop below, so it would spin
  // forever; reject invalid sizes up front instead of hanging the crawler.
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${String(size)}`);
  }

  const result = [];
  const total = array.length;
  for (let start = 0; start < total; start += size) {
    result.push(array.slice(start, start + size));
  }
  return result;
};
|
||||
|
||||
class OlxCrawler {
|
||||
constructor(
|
||||
savers = [],
|
||||
@@ -66,6 +77,7 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
logDebug("Starting OLX crawl");
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
const newRealEstates = [];
|
||||
@@ -227,19 +239,21 @@ class OlxCrawler {
|
||||
asyncScraping.push(hrefs[i]);
|
||||
}
|
||||
|
||||
|
||||
const allChunks = chunk(asyncScraping, 2);
|
||||
const dataResults = []
|
||||
const { scrapedData, errors } = await PromisePool
|
||||
.withConcurrency(2)
|
||||
.for(asyncScraping)
|
||||
.process(async data => {
|
||||
const result = await this.scrapeAd(data)
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
dataResults.push(result)
|
||||
return result; //TODO: this does not work, scrapedData is null, dataResults works
|
||||
})
|
||||
for (let i = 0; i < allChunks.length; i++) {
|
||||
const singleChunk = allChunks[i];
|
||||
const promises = singleChunk.map(c => this.scrapeAd(c))
|
||||
const chunkResults = await Promise.all(promises);
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
dataResults.push(...chunkResults);
|
||||
logDebug("Chunk results len:", chunkResults.length);
|
||||
}
|
||||
|
||||
|
||||
const filteredScrapedData = dataResults.filter(adData => !!adData);
|
||||
logDebug("Filtered scraped data length: ", filteredScrapedData.length);
|
||||
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
console.error("Exception caught:" + e);
|
||||
@@ -248,7 +262,7 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async scrapeAd(url) {
|
||||
//console.log("Scraping : ", url);
|
||||
logDebug("Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
|
||||
13
app/helpers/log.js
Normal file
13
app/helpers/log.js
Normal file
@@ -0,0 +1,13 @@
|
||||
const {
|
||||
PRINT_CRAWLER_DEBUG
|
||||
} = require("../config/appConfig");
|
||||
|
||||
/**
 * Debug logger gated by the PRINT_CRAWLER_DEBUG configuration flag.
 *
 * When the flag is truthy, every argument is forwarded unchanged to
 * `console.log`; otherwise the call does nothing.
 *
 * @param {...*} args - Values to print.
 * @returns {void}
 */
const logDebug = (...args) => {
  if (!PRINT_CRAWLER_DEBUG) {
    return;
  }
  console.log(...args);
};
|
||||
|
||||
module.exports = {
|
||||
logDebug
|
||||
};
|
||||
Reference in New Issue
Block a user