refactor Prostor crawler
This commit is contained in:
@@ -29,5 +29,6 @@ module.exports = {
|
||||
PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories,
|
||||
PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [],
|
||||
PROSTOR_DELAY_BETWEEN_PAGES:
|
||||
parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000
|
||||
parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000,
|
||||
PROSTOR_FORCE_CRAWL: !!parseInt(process.env.PROSTOR_FORCE_CRAWL)
|
||||
};
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
const fetch = require("node-fetch");
|
||||
const cheerio = require("cheerio");
|
||||
const moment = require("moment-timezone");
|
||||
|
||||
const {
|
||||
AD_TYPE,
|
||||
@@ -11,7 +12,11 @@ const {
|
||||
CRAWLER_AD_TYPE
|
||||
} = require("../../common/enums");
|
||||
|
||||
const { PRINT_CRAWLER_DEBUG } = require("../../config/appConfig");
|
||||
const {
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
DEFAULT_TIMEZONE
|
||||
} = require("../../config/appConfig");
|
||||
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
|
||||
|
||||
const PROSTOR_ENUMS = {
|
||||
PROSTOR_AD_TYPE: {
|
||||
@@ -48,9 +53,10 @@ class ProstorCrawler {
|
||||
this.crawlerAdTypes = crawlerAdTypes;
|
||||
this.crawlerAdCategories = crawlerAdCategories;
|
||||
this.maxResultsPerPage = maxResultsPerPage;
|
||||
this.delayBetweenPages = delayBetweenPages;
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
async crawlOld() {
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
const newRealEstates = [];
|
||||
|
||||
@@ -79,6 +85,290 @@ class ProstorCrawler {
|
||||
return newRealEstates;
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
const newRealEstates = [];
|
||||
|
||||
if (crawlAdCategories) {
|
||||
const indexGenerators = [];
|
||||
for (const adCategory of crawlAdCategories) {
|
||||
indexGenerators.push(this.categoryIndexer(adCategory));
|
||||
}
|
||||
|
||||
let done = false;
|
||||
while (!done) {
|
||||
const categoryIndexerPromises = [];
|
||||
const generatorsToRemove = [];
|
||||
for (const indexGenerator of indexGenerators) {
|
||||
categoryIndexerPromises.push(indexGenerator.next());
|
||||
generatorsToRemove.push(false);
|
||||
}
|
||||
|
||||
const singlePageResults = await Promise.all(categoryIndexerPromises);
|
||||
const entries = singlePageResults.entries();
|
||||
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (singlePageResult) {
|
||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||
const { newRecords } = saveResults;
|
||||
|
||||
newRealEstates.push(...newRecords);
|
||||
|
||||
if (
|
||||
Array.isArray(newRecords) &&
|
||||
newRecords.length === 0 &&
|
||||
!PROSTOR_FORCE_CRAWL
|
||||
) {
|
||||
generatorsToRemove[index] = true;
|
||||
}
|
||||
} else {
|
||||
//Generator returned undefined, remove this generator from array
|
||||
generatorsToRemove[index] = true;
|
||||
// console.log("Generator ", index + 1, "has no more pages");
|
||||
}
|
||||
}
|
||||
|
||||
// console.log("Generators state : ", generatorsToRemove);
|
||||
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
|
||||
if (generatorsToRemove[i]) {
|
||||
// console.log("\tRemove generator ", i + 1);
|
||||
indexGenerators.splice(i, 1);
|
||||
}
|
||||
}
|
||||
if (indexGenerators.length === 0) {
|
||||
done = true;
|
||||
}
|
||||
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
}
|
||||
}
|
||||
return newRealEstates;
|
||||
}
|
||||
|
||||
async *categoryIndexer(adCategory) {
|
||||
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
||||
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||
const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
|
||||
const listOfAllRealEstates = await this.extractRealEstates(
|
||||
urlPageToCrawl
|
||||
);
|
||||
|
||||
let elementToStartIndexFrom = 0;
|
||||
while (true) {
|
||||
const realEstatesForSinglePage = listOfAllRealEstates.slice(
|
||||
elementToStartIndexFrom,
|
||||
elementToStartIndexFrom + this.maxResultsPerPage
|
||||
);
|
||||
|
||||
if (realEstatesForSinglePage.length > 0) {
|
||||
elementToStartIndexFrom += realEstatesForSinglePage.length;
|
||||
|
||||
const singlePageResults = await this.indexSinglePage(
|
||||
realEstatesForSinglePage
|
||||
);
|
||||
|
||||
const filteredSinglePageResults = singlePageResults.filter(
|
||||
singleResult => !!singleResult
|
||||
);
|
||||
|
||||
if (
|
||||
Array.isArray(filteredSinglePageResults) &&
|
||||
filteredSinglePageResults.length > 0
|
||||
) {
|
||||
yield filteredSinglePageResults;
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
async indexSinglePage(realEstatesList) {
|
||||
const asyncActions = [];
|
||||
for (const realEstate of realEstatesList) {
|
||||
asyncActions.push(this.scrapeAd(realEstate));
|
||||
}
|
||||
|
||||
try {
|
||||
return await Promise.all(asyncActions);
|
||||
} catch (e) {
|
||||
console.log(
|
||||
"[PROSTOR] Error crawling ads : ",
|
||||
e.message || "UNKNOWN ERROR"
|
||||
);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
async scrapeAd(realEstate) {
|
||||
const { lat, lng, property_name, price, size, link } = realEstate;
|
||||
const url = `https://prostor.ba${link}`;
|
||||
console.log("[PROSTOR] Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
let numberOfRooms = null,
|
||||
numberOfFloors = null,
|
||||
floor = null,
|
||||
accessRoadType = null,
|
||||
heatingType = null,
|
||||
furnishingType = null,
|
||||
balcony = null,
|
||||
newBuilding = null,
|
||||
elevator = null,
|
||||
water = null,
|
||||
electricity = null,
|
||||
drainageSystem = null,
|
||||
registeredInZkBooks = null,
|
||||
recentlyAdapted = null,
|
||||
parking = null,
|
||||
garage = null,
|
||||
gas = null,
|
||||
antiTheftDoor = null,
|
||||
airCondition = null,
|
||||
phoneConnection = null,
|
||||
cableTV = null,
|
||||
internet = null,
|
||||
basementAttic = null,
|
||||
storeRoom = null,
|
||||
videoSurveillance = null,
|
||||
alarm = null,
|
||||
suitableForStudents = null,
|
||||
includingBills = null,
|
||||
animalsAllowed = null,
|
||||
pool = null,
|
||||
urbanPlanPermit = null,
|
||||
buildingPermit = null,
|
||||
utilityConnection = null,
|
||||
distanceToRiver = null,
|
||||
numberOfViewsAgency = null;
|
||||
|
||||
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
||||
// general form is : /actionType/realEstateType/location/realEstateID
|
||||
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
||||
|
||||
const linkParts = link.split("/");
|
||||
|
||||
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
|
||||
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
|
||||
const prostorId = linkParts[4];
|
||||
|
||||
if (!adType || !realEstateType || !prostorId) {
|
||||
console.log(
|
||||
"adType: ",
|
||||
adType,
|
||||
" reType: ",
|
||||
realEstateType,
|
||||
" prostorId: ",
|
||||
prostorId,
|
||||
"url: ",
|
||||
url
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
const adStatus = AD_STATUS.STATUS_NORMAL;
|
||||
const title = property_name;
|
||||
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
|
||||
const parsedArea = parseFloat(size);
|
||||
const gardenSize = null;
|
||||
const longDescription = null;
|
||||
const publishedDateMoment = moment.tz(DEFAULT_TIMEZONE);
|
||||
if (!publishedDateMoment.isValid()) {
|
||||
throw {
|
||||
message: `Invalid published date`
|
||||
};
|
||||
}
|
||||
|
||||
const renewedDateMoment = moment.tz(DEFAULT_TIMEZONE);
|
||||
if (!renewedDateMoment.isValid()) {
|
||||
throw {
|
||||
message: `Invalid renewed date`
|
||||
};
|
||||
}
|
||||
|
||||
const data = {
|
||||
url,
|
||||
agencyObjectId: prostorId,
|
||||
originAgencyName: AD_AGENCY.PROSTOR,
|
||||
realEstateType,
|
||||
adType,
|
||||
title,
|
||||
price: parsedPrice,
|
||||
area: parsedArea,
|
||||
gardenSize,
|
||||
shortDescription: "",
|
||||
longDescription: longDescription,
|
||||
streetNumber: 0,
|
||||
streetName: "",
|
||||
locality: "",
|
||||
municipality: "",
|
||||
city: "",
|
||||
region: "",
|
||||
entity: "",
|
||||
country: "",
|
||||
locationLat: lat,
|
||||
locationLong: lng,
|
||||
adStatus,
|
||||
publishedDate: publishedDateMoment.toISOString(),
|
||||
renewedDate: renewedDateMoment.toISOString(),
|
||||
numberOfRooms,
|
||||
numberOfFloors,
|
||||
floor,
|
||||
accessRoadType,
|
||||
heatingType,
|
||||
furnishingType,
|
||||
balcony,
|
||||
newBuilding,
|
||||
elevator,
|
||||
water,
|
||||
electricity,
|
||||
drainageSystem,
|
||||
registeredInZkBooks,
|
||||
recentlyAdapted,
|
||||
parking,
|
||||
garage,
|
||||
gas,
|
||||
antiTheftDoor,
|
||||
airCondition,
|
||||
phoneConnection,
|
||||
cableTV,
|
||||
internet,
|
||||
basementAttic,
|
||||
storeRoom,
|
||||
videoSurveillance,
|
||||
alarm,
|
||||
suitableForStudents,
|
||||
includingBills,
|
||||
animalsAllowed,
|
||||
pool,
|
||||
urbanPlanPermit,
|
||||
buildingPermit,
|
||||
utilityConnection,
|
||||
distanceToRiver,
|
||||
numberOfViewsAgency
|
||||
};
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error(
|
||||
"[PROSTOR] Exception caught: " + e.message,
|
||||
"\r\nURL:",
|
||||
url
|
||||
);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
async extractRealEstates(url) {
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("[PROSTOR] Index page : ", url);
|
||||
@@ -115,18 +405,19 @@ class ProstorCrawler {
|
||||
const jsonData = scriptData.substring(23, jsonEndIndex) + "]";
|
||||
const realEstates = JSON.parse(jsonData);
|
||||
|
||||
const transformedRealEstates = [];
|
||||
|
||||
for (const realEstate of realEstates) {
|
||||
const transformedRealEstate = ProstorCrawler.transformRealEstateData(
|
||||
realEstate
|
||||
);
|
||||
if (transformedRealEstate) {
|
||||
transformedRealEstates.push(transformedRealEstate);
|
||||
}
|
||||
}
|
||||
|
||||
return transformedRealEstates;
|
||||
// const transformedRealEstates = [];
|
||||
//
|
||||
// for (const realEstate of realEstates) {
|
||||
// const transformedRealEstate = ProstorCrawler.transformRealEstateData(
|
||||
// realEstate
|
||||
// );
|
||||
// if (transformedRealEstate) {
|
||||
// transformedRealEstates.push(transformedRealEstate);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return transformedRealEstates;
|
||||
return realEstates;
|
||||
} else {
|
||||
throw {
|
||||
message: "Something is wrong with JSON data or data is moved"
|
||||
@@ -134,11 +425,14 @@ class ProstorCrawler {
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
throw { message: "Can't find ad data JSON" };
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
console.error("[PROSTOR] Exception caught:", e.message);
|
||||
console.error(
|
||||
"[PROSTOR] Exception caught:",
|
||||
e.message || "UNKNOWN MESSAGE"
|
||||
);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
@@ -236,6 +530,10 @@ class ProstorCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
async sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
async saveCrawledResults(results) {
|
||||
const savers = this.savers;
|
||||
|
||||
@@ -244,7 +542,7 @@ class ProstorCrawler {
|
||||
// }
|
||||
|
||||
//For now, we use only Postgres saver, so ...
|
||||
return await savers[0].save(results);
|
||||
return savers[0].save(results);
|
||||
//so that we can use some sequelize options and information when data is inserted
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user