This commit is contained in:
=
2020-09-13 04:48:11 -07:00
parent 8df94da48c
commit a481ecfe37
4 changed files with 51 additions and 20 deletions

View File

@@ -59,16 +59,17 @@ async function crawlAll() {
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
),
new SaljicCrawler(
[postgresSaver],
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
SALJIC_CONFIG.SALJIC_MAX_PAGES,
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
)
//,
//new SaljicCrawler(
//[postgresSaver],
//SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
//SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
//SALJIC_CONFIG.SALJIC_MAX_PAGES,
//SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
//SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
//SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
//)
];
const newRealEstates = [];

View File

@@ -1,6 +1,7 @@
"use strict";
const fetch = require("../../helpers/fetchWrapper");
const { logDebug } = require("../../helpers/log");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");
@@ -45,6 +46,16 @@ const OLX_ENUMS = {
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
/**
 * Splits an array into consecutive sub-arrays of at most `size` elements.
 *
 * The input array is not modified; every chunk is a shallow copy produced by
 * `Array.prototype.slice`. The last chunk may hold fewer than `size` items.
 *
 * @param {Array} array - Items to partition.
 * @param {number} [size=10] - Maximum chunk length; must be a positive integer.
 * @returns {Array<Array>} Chunks in original order; `[]` when `array` is empty.
 * @throws {RangeError} If `size` is not a positive integer.
 */
const chunk = (array, size = 10) => {
  // A zero or negative step never moves the index towards the end of the
  // array, so without this guard the loop below would never terminate.
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunk size must be a positive integer, got ${String(size)}`);
  }

  const result = [];
  const total = array.length;
  for (let start = 0; start < total; start += size) {
    result.push(array.slice(start, start + size));
  }
  return result;
};
class OlxCrawler {
constructor(
savers = [],
@@ -66,6 +77,7 @@ class OlxCrawler {
}
async crawl() {
logDebug("Starting OLX crawl");
const crawlAdCategories = this.crawlerAdCategories;
const newRealEstates = [];
@@ -227,19 +239,21 @@ class OlxCrawler {
asyncScraping.push(hrefs[i]);
}
const allChunks = chunk(asyncScraping, 2);
const dataResults = []
const { scrapedData, errors } = await PromisePool
.withConcurrency(2)
.for(asyncScraping)
.process(async data => {
const result = await this.scrapeAd(data)
await this.sleep(this.delayBetweenPages);
dataResults.push(result)
return result; //TODO: this does not work, scrapedData is null, dataResults works
})
for (let i = 0; i < allChunks.length; i++) {
const singleChunk = allChunks[i];
const promises = singleChunk.map(c => this.scrapeAd(c))
const chunkResults = await Promise.all(promises);
await this.sleep(this.delayBetweenPages);
dataResults.push(...chunkResults);
logDebug("Chunk results len:", chunkResults.length);
}
const filteredScrapedData = dataResults.filter(adData => !!adData);
logDebug("Filtered scraped data length: ", filteredScrapedData.length);
return filteredScrapedData;
} catch (e) {
console.error("Exception caught:" + e);
@@ -248,7 +262,7 @@ class OlxCrawler {
}
async scrapeAd(url) {
//console.log("Scraping : ", url);
logDebug("Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();

13
app/helpers/log.js Normal file
View File

@@ -0,0 +1,13 @@
const {
PRINT_CRAWLER_DEBUG
} = require("../config/appConfig");
/**
 * Debug logger gated by the PRINT_CRAWLER_DEBUG setting.
 *
 * When PRINT_CRAWLER_DEBUG is truthy, every argument is forwarded unchanged
 * to console.log; otherwise the call does nothing.
 *
 * @param {...*} parts - Values to print.
 * @returns {void}
 */
const logDebug = (...parts) => {
  if (!PRINT_CRAWLER_DEBUG) {
    return;
  }
  console.log(...parts);
};
module.exports = {
logDebug
};

View File

@@ -4,6 +4,7 @@ const bodyParser = require("body-parser");
const layout = require("express-layout");
const compression = require("compression");
const forceSSL = require("./app/helpers/forceSSL");
const { logDebug } = require("./app/helpers/log");
const {
APP_PORT,
@@ -38,9 +39,11 @@ app.listen(APP_PORT, () =>
let crawlerRunning = STOP_CRAWLER;
const crawl = () => {
logDebug("Crawl start");
if (!crawlerRunning) {
crawlerRunning = true;
crawlAll().then(newRealEstates => {
logDebug("crawlAll done, new real estate len: ", newRealEstates.length);
crawlerRunning = false;
notifyForNewRealEstates(newRealEstates);
});