refactor Prostor crawler

This commit is contained in:
Bilal Catic
2019-11-13 16:54:16 +01:00
parent ae93d2f03d
commit 3b3e2eda07
2 changed files with 317 additions and 18 deletions

View File

@@ -29,5 +29,6 @@ module.exports = {
PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories,
PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [],
PROSTOR_DELAY_BETWEEN_PAGES:
parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000
parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000,
PROSTOR_FORCE_CRAWL: !!parseInt(process.env.PROSTOR_FORCE_CRAWL)
};

View File

@@ -2,6 +2,7 @@
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const {
AD_TYPE,
@@ -11,7 +12,11 @@ const {
CRAWLER_AD_TYPE
} = require("../../common/enums");
const { PRINT_CRAWLER_DEBUG } = require("../../config/appConfig");
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
} = require("../../config/appConfig");
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
const PROSTOR_ENUMS = {
PROSTOR_AD_TYPE: {
@@ -48,9 +53,10 @@ class ProstorCrawler {
this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories;
this.maxResultsPerPage = maxResultsPerPage;
this.delayBetweenPages = delayBetweenPages;
}
async crawl() {
async crawlOld() {
const crawlAdCategories = this.crawlerAdCategories;
const newRealEstates = [];
@@ -79,6 +85,290 @@ class ProstorCrawler {
return newRealEstates;
}
async crawl() {
const crawlAdCategories = this.crawlerAdCategories;
const newRealEstates = [];
if (crawlAdCategories) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
indexGenerators.push(this.categoryIndexer(adCategory));
}
let done = false;
while (!done) {
const categoryIndexerPromises = [];
const generatorsToRemove = [];
for (const indexGenerator of indexGenerators) {
categoryIndexerPromises.push(indexGenerator.next());
generatorsToRemove.push(false);
}
const singlePageResults = await Promise.all(categoryIndexerPromises);
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords } = saveResults;
newRealEstates.push(...newRecords);
if (
Array.isArray(newRecords) &&
newRecords.length === 0 &&
!PROSTOR_FORCE_CRAWL
) {
generatorsToRemove[index] = true;
}
} else {
//Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true;
// console.log("Generator ", index + 1, "has no more pages");
}
}
// console.log("Generators state : ", generatorsToRemove);
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
if (generatorsToRemove[i]) {
// console.log("\tRemove generator ", i + 1);
indexGenerators.splice(i, 1);
}
}
if (indexGenerators.length === 0) {
done = true;
}
await this.sleep(this.delayBetweenPages);
}
}
return newRealEstates;
}
async *categoryIndexer(adCategory) {
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
const listOfAllRealEstates = await this.extractRealEstates(
urlPageToCrawl
);
let elementToStartIndexFrom = 0;
while (true) {
const realEstatesForSinglePage = listOfAllRealEstates.slice(
elementToStartIndexFrom,
elementToStartIndexFrom + this.maxResultsPerPage
);
if (realEstatesForSinglePage.length > 0) {
elementToStartIndexFrom += realEstatesForSinglePage.length;
const singlePageResults = await this.indexSinglePage(
realEstatesForSinglePage
);
const filteredSinglePageResults = singlePageResults.filter(
singleResult => !!singleResult
);
if (
Array.isArray(filteredSinglePageResults) &&
filteredSinglePageResults.length > 0
) {
yield filteredSinglePageResults;
} else {
return undefined;
}
} else {
return undefined;
}
}
} else {
return undefined;
}
}
async indexSinglePage(realEstatesList) {
const asyncActions = [];
for (const realEstate of realEstatesList) {
asyncActions.push(this.scrapeAd(realEstate));
}
try {
return await Promise.all(asyncActions);
} catch (e) {
console.log(
"[PROSTOR] Error crawling ads : ",
e.message || "UNKNOWN ERROR"
);
return [];
}
}
async scrapeAd(realEstate) {
const { lat, lng, property_name, price, size, link } = realEstate;
const url = `https://prostor.ba${link}`;
console.log("[PROSTOR] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
const $ = cheerio.load(body);
let numberOfRooms = null,
numberOfFloors = null,
floor = null,
accessRoadType = null,
heatingType = null,
furnishingType = null,
balcony = null,
newBuilding = null,
elevator = null,
water = null,
electricity = null,
drainageSystem = null,
registeredInZkBooks = null,
recentlyAdapted = null,
parking = null,
garage = null,
gas = null,
antiTheftDoor = null,
airCondition = null,
phoneConnection = null,
cableTV = null,
internet = null,
basementAttic = null,
storeRoom = null,
videoSurveillance = null,
alarm = null,
suitableForStudents = null,
includingBills = null,
animalsAllowed = null,
pool = null,
urbanPlanPermit = null,
buildingPermit = null,
utilityConnection = null,
distanceToRiver = null,
numberOfViewsAgency = null;
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
const linkParts = link.split("/");
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
const prostorId = linkParts[4];
if (!adType || !realEstateType || !prostorId) {
console.log(
"adType: ",
adType,
" reType: ",
realEstateType,
" prostorId: ",
prostorId,
"url: ",
url
);
return null;
}
const adStatus = AD_STATUS.STATUS_NORMAL;
const title = property_name;
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
const parsedArea = parseFloat(size);
const gardenSize = null;
const longDescription = null;
const publishedDateMoment = moment.tz(DEFAULT_TIMEZONE);
if (!publishedDateMoment.isValid()) {
throw {
message: `Invalid published date`
};
}
const renewedDateMoment = moment.tz(DEFAULT_TIMEZONE);
if (!renewedDateMoment.isValid()) {
throw {
message: `Invalid renewed date`
};
}
const data = {
url,
agencyObjectId: prostorId,
originAgencyName: AD_AGENCY.PROSTOR,
realEstateType,
adType,
title,
price: parsedPrice,
area: parsedArea,
gardenSize,
shortDescription: "",
longDescription: longDescription,
streetNumber: 0,
streetName: "",
locality: "",
municipality: "",
city: "",
region: "",
entity: "",
country: "",
locationLat: lat,
locationLong: lng,
adStatus,
publishedDate: publishedDateMoment.toISOString(),
renewedDate: renewedDateMoment.toISOString(),
numberOfRooms,
numberOfFloors,
floor,
accessRoadType,
heatingType,
furnishingType,
balcony,
newBuilding,
elevator,
water,
electricity,
drainageSystem,
registeredInZkBooks,
recentlyAdapted,
parking,
garage,
gas,
antiTheftDoor,
airCondition,
phoneConnection,
cableTV,
internet,
basementAttic,
storeRoom,
videoSurveillance,
alarm,
suitableForStudents,
includingBills,
animalsAllowed,
pool,
urbanPlanPermit,
buildingPermit,
utilityConnection,
distanceToRiver,
numberOfViewsAgency
};
return data;
} catch (e) {
console.error(
"[PROSTOR] Exception caught: " + e.message,
"\r\nURL:",
url
);
return null;
}
}
async extractRealEstates(url) {
if (PRINT_CRAWLER_DEBUG) {
console.log("[PROSTOR] Index page : ", url);
@@ -115,18 +405,19 @@ class ProstorCrawler {
const jsonData = scriptData.substring(23, jsonEndIndex) + "]";
const realEstates = JSON.parse(jsonData);
const transformedRealEstates = [];
for (const realEstate of realEstates) {
const transformedRealEstate = ProstorCrawler.transformRealEstateData(
realEstate
);
if (transformedRealEstate) {
transformedRealEstates.push(transformedRealEstate);
}
}
return transformedRealEstates;
// const transformedRealEstates = [];
//
// for (const realEstate of realEstates) {
// const transformedRealEstate = ProstorCrawler.transformRealEstateData(
// realEstate
// );
// if (transformedRealEstate) {
// transformedRealEstates.push(transformedRealEstate);
// }
// }
//
// return transformedRealEstates;
return realEstates;
} else {
throw {
message: "Something is wrong with JSON data or data is moved"
@@ -134,11 +425,14 @@ class ProstorCrawler {
}
} catch (e) {
console.log(e);
throw { message: "Can't find ad data JSON" };
throw e;
}
}
} catch (e) {
console.error("[PROSTOR] Exception caught:", e.message);
console.error(
"[PROSTOR] Exception caught:",
e.message || "UNKNOWN MESSAGE"
);
return [];
}
}
@@ -236,6 +530,10 @@ class ProstorCrawler {
}
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
async saveCrawledResults(results) {
const savers = this.savers;
@@ -244,7 +542,7 @@ class ProstorCrawler {
// }
//For now, we use only Postgres saver, so ...
return await savers[0].save(results);
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
}