// Files
// old-web/app/crawler/specificCrawlers/saljic.js
// 2020-02-20 19:49:29 +01:00
//
// 631 lines
// 18 KiB
// JavaScript

"use strict";
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const {
AD_TYPE,
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE,
FURNISHING_TYPE,
HEATING_TYPE
} = require("../../common/enums");
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
} = require("../../config/appConfig");
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
// Query-string fragments used to build the search URL of the Saljic site.
// Each value is a complete "&name=value" fragment that is concatenated
// verbatim into the URL; an empty value after "=" is what the "ALL" entries use.
const SALJIC_ENUMS = {
// Keyed by CRAWLER_AD_TYPE values; selects the "input_vrsta" filter.
SALJIC_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
},
// Keyed by AD_CATEGORY.<X>.id; selects the "input_kategorija" filter.
SALJIC_AD_CATEGORY: {
[AD_CATEGORY.ALL.id]: "&input_kategorija=",
[AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
[AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
[AD_CATEGORY.LAND.id]: "&input_kategorija=5", // site categories 3 and 4 are also building land ("gradjevinsko")
[AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
[AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
[AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
// No mapping for AD_CATEGORY.COTTAGE yet:
//[AD_CATEGORY.COTTAGE.id]: ""
}
};
/**
 * Crawler for real-estate ads listed on saljicnekretnine.ba.
 *
 * The crawl walks the paginated search results of every configured category
 * (one async generator per category, all advanced in lock-step), scrapes each
 * ad page found there and hands every page worth of ads to the first saver.
 */
class SaljicCrawler {
  /**
   * @param {Array<Object>} savers - Persistence back-ends; only the first one
   *   is used and it must expose `save(results)`.
   * @param {*} crawlerAdTypes - Key into `SALJIC_ENUMS.SALJIC_AD_TYPE`.
   * @param {Array<*>} crawlerAdCategories - Categories to crawl. Each entry is
   *   either a key of `SALJIC_ENUMS.SALJIC_AD_CATEGORY` or an object whose
   *   `id` is such a key.
   * @param {number} maxPages - Maximum number of result pages per category.
   * @param {number} maxResultsPerPage - Maximum number of ads scraped per page.
   * @param {Array<string>} ignoredUsernames - Stored for callers; this crawler
   *   does not filter by it.
   * @param {number} delayBetweenPages - Pause in milliseconds between rounds.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.siteOrigin = "https://www.saljicnekretnine.ba";
    this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    // Previously dropped, which left the page limit in categoryIndexer dead.
    this.maxPages = maxPages;
    this.maxResultsPerPage = maxResultsPerPage;
    this.ignoredUsernames = ignoredUsernames;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Crawls all configured categories page by page.
   *
   * A category stops being crawled when its generator is exhausted or when a
   * saved page produced no new records (unless SALJIC_FORCE_CRAWL is set).
   *
   * @returns {Promise<Array>} All records the saver reported as new.
   */
  async crawl() {
    const newRealEstates = [];
    const crawlAdCategories = this.crawlerAdCategories;
    if (!crawlAdCategories) {
      return newRealEstates;
    }

    let indexGenerators = [];
    for (const adCategory of crawlAdCategories) {
      indexGenerators.push(this.categoryIndexer(adCategory));
    }

    while (indexGenerators.length > 0) {
      // Fetch the next page of every still-active category concurrently.
      const singlePageResults = await Promise.all(
        indexGenerators.map(indexGenerator => indexGenerator.next())
      );
      const remainingGenerators = [];
      for (const [
        index,
        { value: singlePageResult }
      ] of singlePageResults.entries()) {
        if (!singlePageResult) {
          // Generator returned undefined: no more pages for this category.
          continue;
        }
        const saveResults = await this.saveCrawledResults(singlePageResult);
        const newRecords = saveResults ? saveResults.newRecords : undefined;
        const hasRecordList = Array.isArray(newRecords);
        if (hasRecordList) {
          newRealEstates.push(...newRecords);
        }
        if (hasRecordList && newRecords.length === 0 && !SALJIC_FORCE_CRAWL) {
          // Nothing new on this page, so older pages are assumed to be known.
          continue;
        }
        remainingGenerators.push(indexGenerators[index]);
      }
      indexGenerators = remainingGenerators;
      if (indexGenerators.length > 0) {
        await this.sleep(this.delayBetweenPages);
      }
    }
    return newRealEstates;
  }

  /**
   * Async generator yielding the scraped ads of one search-result page at a
   * time for the given category.
   *
   * Ends when a page yields no ads, when `maxPages` pages were indexed, or
   * immediately when the ad type / category has no URL mapping.
   *
   * @param {*} adCategory - Category key, or an object carrying it as `id`.
   */
  async *categoryIndexer(adCategory) {
    const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes];
    let urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory];
    if (
      urlCategoryPart === undefined &&
      adCategory !== null &&
      typeof adCategory === "object"
    ) {
      // The table is keyed by category id; accept whole category objects too.
      urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory.id];
    }
    if (urlAdTypePart === undefined || urlCategoryPart === undefined) {
      return undefined;
    }

    const pageLimit = Number.isFinite(this.maxPages) ? this.maxPages : Infinity;
    for (let pageToIndex = 1; pageToIndex <= pageLimit; ++pageToIndex) {
      // The site paginates by result offset (22 per page); page 1 has none.
      const urlPagePart = pageToIndex === 1 ? "" : (pageToIndex - 1) * 2 * 11;
      const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}&per_page=${urlPagePart}`;
      const singlePageResults = await this.indexSinglePage(
        urlPageToCrawl,
        this.maxResultsPerPage
      );
      if (!Array.isArray(singlePageResults) || singlePageResults.length === 0) {
        return undefined;
      }
      yield singlePageResults;
    }
    return undefined;
  }

  /**
   * Loads one search-result page and scrapes every ad linked from it.
   *
   * @param {string} url - Search-result page URL.
   * @param {number} maxResultsPerPage - Upper bound of ads to scrape; a
   *   non-numeric value means "no limit".
   * @returns {Promise<Array<Object>>} Scraped ads (failed ads are left out);
   *   an empty array when the page could not be processed.
   */
  async indexSinglePage(url, maxResultsPerPage) {
    if (PRINT_CRAWLER_DEBUG) {
      console.log("[SALJIC] Index page : ", url);
    }
    try {
      const res = await fetch(url);
      const body = await res.text();
      const $ = cheerio.load(body);

      // Link and ad type are read from the same product element so they can
      // never get out of step when one product lacks either of them.
      const listings = [];
      $("#shop")
        .find(".product")
        .each((i, elem) => {
          const product = $(elem);
          const href = product
            .find("a")
            .first()
            .attr("href");
          if (!href) {
            return;
          }
          const adTypeText = product
            .find(".trakica-search-page")
            .text()
            .trim();
          listings.push({
            url: this._toAbsoluteUrl(href),
            adType: this.getAdTypeId(adTypeText)
          });
        });

      const limit = Number.isFinite(maxResultsPerPage)
        ? Math.max(0, maxResultsPerPage)
        : listings.length;
      const scrapedData = await Promise.all(
        listings
          .slice(0, limit)
          .map(listing => this.scrapeAd(listing.url, listing.adType))
      );
      return scrapedData.filter(adData => !!adData);
    } catch (e) {
      console.error("[SALJIC] Exception caught:" + e);
      return [];
    }
  }

  /**
   * Scrapes a single ad page.
   *
   * @param {string} url - Absolute URL of the ad page.
   * @param {*} adType - Ad type already resolved from the result list.
   * @returns {Promise<Object|null>} The ad record, or null when scraping failed.
   */
  async scrapeAd(url, adType) {
    try {
      const adPageSource = await fetch(url);
      const body = await adPageSource.text();
      const $ = cheerio.load(body);

      const entrySelector =
        "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry";
      const contentSelector = `${entrySelector} > div.entry-content.topmargin`;
      const featureListSelector = `${contentSelector} > div.col-md-12.bottommargin > ul`;

      const title = this._collapseWhitespace(
        $(`${entrySelector} > div.entry-title > h2`).text()
      );
      const priceText = this._collapseWhitespace(
        $(
          `${entrySelector} > div.topmargin-sm.single-product > div.product > div.product-price > ins`
        ).text()
      );
      const streetName = $(`${contentSelector} > p`)
        .text()
        .replace(/(\r\n|\n|\r)/gm, "")
        .trim();
      const descriptions = $(
        `${contentSelector} > div.toggle.toggle-bg > div.togglec >p:nth-child(1)`
      )
        .text()
        .replace(/\"/g, "")
        .trim();
      const { locationLat, locationLong } = this._parseMarkerCoordinates(
        $(`${contentSelector} > div.gmap.bottommargin > iframe`).attr("src")
      );

      // First sentence, or the whole text when it contains no period.
      const firstPeriodAt = descriptions.indexOf(".");
      const shortDescription =
        firstPeriodAt === -1
          ? descriptions
          : descriptions.substring(0, firstPeriodAt);

      const data = {
        url,
        // The numeric id follows the 46-character ad URL prefix.
        agencyObjectId: parseInt(url.substring(46), 10),
        originAgencyName: AD_AGENCY.SALJIC,
        realEstateType: undefined,
        adType,
        title,
        price: this._parsePrice(priceText),
        area: null,
        gardenSize: null,
        shortDescription,
        longDescription: descriptions,
        streetNumber: 0,
        streetName,
        locality: "",
        municipality: "",
        city: "",
        region: "",
        entity: "",
        country: "",
        locationLat,
        locationLong,
        // The site exposes no status information (e.g. "sold").
        adStatus: AD_STATUS.STATUS_NORMAL,
        publishedDate: null,
        renewedDate: null,
        numberOfRooms: null,
        numberOfFloors: null,
        floor: null,
        accessRoadType: null,
        heatingType: null,
        furnishingType: null,
        balcony: null,
        newBuilding: null,
        elevator: null,
        water: null,
        electricity: null,
        drainageSystem: null,
        registeredInZkBooks: null,
        recentlyAdapted: null,
        parking: null,
        garage: null,
        gas: null,
        antiTheftDoor: null,
        airCondition: null,
        phoneConnection: null,
        cableTV: null,
        internet: null,
        basementAttic: null,
        storeRoom: null,
        videoSurveillance: null,
        alarm: null,
        suitableForStudents: null,
        includingBills: null,
        animalsAllowed: null,
        pool: null,
        exchange: null,
        urbanPlanPermit: null,
        buildingPermit: null,
        utilityConnection: null,
        distanceToRiver: null,
        numberOfViewsAgency: null,
        numberOfViewsKivi: null
      };

      // Main characteristics: walk the list items until the first empty one.
      for (let mainFieldIndex = 1; ; mainFieldIndex++) {
        const mainField = $(
          `${featureListSelector} > li.list-group-item:nth-child(${mainFieldIndex})`
        )
          .text()
          .replace(/[\n\r\t]/gm, "")
          .trim();
        if (mainField === "") {
          break;
        }
        this._applyMainField(data, mainField);
      }

      // Additional features: the first item carries the category, the
      // following ones are plain feature names.
      for (let additionalFieldIndex = 1; ; additionalFieldIndex++) {
        const additionalField = $(
          `${featureListSelector} > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom:nth-child(${additionalFieldIndex})`
        )
          .text()
          .trim();
        if (additionalFieldIndex === 1) {
          data.realEstateType = this.getAdCategoryId(
            this._extractCategoryText(additionalField)
          );
        } else {
          this._applyAdditionalField(data, additionalField);
        }
        if (additionalField === "") {
          break;
        }
      }

      // Without a usable date on the page the crawl time is used instead.
      if (!data.publishedDate) {
        data.publishedDate = new Date();
      }
      data.renewedDate = new Date();
      return data;
    } catch (e) {
      console.error("Exception caught: " + e.message, "\r\nURL:", url);
    }
    return null;
  }

  //======= HELPER FUNCTIONS =============

  /** Removes line breaks, collapses runs of spaces and trims the text. */
  _collapseWhitespace(text) {
    return text
      .replace(/(\r\n|\n|\r)/gm, "")
      .replace(/ {1,}/g, " ")
      .trim();
  }

  /** Prefixes site-relative links with the site origin; keeps absolute ones. */
  _toAbsoluteUrl(href) {
    return /^https?:\/\//i.test(href) ? href : this.siteOrigin + href;
  }

  /** Maps NaN to null so unparsable numbers never reach the saver. */
  _numberOrNull(value) {
    return Number.isNaN(value) ? null : value;
  }

  /**
   * Parses the price label, which is expected to carry an 8-character prefix
   * and a 3-character currency suffix around a comma-grouped number.
   *
   * @returns {number|null} null for "price on request" or unparsable text.
   */
  _parsePrice(priceText) {
    if (priceText === "CIJENA NA UPIT") {
      return null;
    }
    const digits = priceText
      .substring(8, priceText.length - 3)
      // Every grouping comma has to go, not just the first one.
      .replace(/,/g, "");
    return this._numberOrNull(parseFloat(digits));
  }

  /**
   * Reads "marker=<lat>%2C<long>" from the map iframe URL.
   *
   * @param {string|undefined} src - iframe `src`; may be missing.
   * @returns {{locationLat: (number|null), locationLong: (number|null)}}
   */
  _parseMarkerCoordinates(src) {
    const noLocation = { locationLat: null, locationLong: null };
    if (typeof src !== "string") {
      return noLocation;
    }
    const markerAt = src.indexOf("marker=");
    if (markerAt === -1) {
      return noLocation;
    }
    const separatorAt = src.indexOf("%2C", markerAt);
    if (separatorAt === -1) {
      return noLocation;
    }
    return {
      locationLat: parseFloat(src.substring(markerAt + 7, separatorAt)) || null,
      locationLong: parseFloat(src.substring(separatorAt + 3)) || null
    };
  }

  /**
   * Parses a "DD.MM.YYYY"-shaped value into a local-time Date.
   *
   * @returns {Date|null} null when the parts are not a plausible date.
   */
  _parseDayMonthYear(value) {
    const day = parseInt(value.substring(0, 2), 10);
    const month = parseInt(value.substring(3, 5), 10);
    const year = parseInt(value.substring(6), 10);
    const isPlausible =
      !Number.isNaN(year) && month >= 1 && month <= 12 && day >= 1 && day <= 31;
    return isPlausible ? new Date(year, month - 1, day) : null;
  }

  /**
   * Applies one "main characteristics" row ("<label> <value>") to the record.
   * Labels may consist of several words, so rows are matched by label prefix.
   */
  _applyMainField(data, fieldText) {
    // Longest labels first so a longer label always wins over a shorter one.
    const knownLabels = [
      "Broj parking mjesta",
      "Godina renoviranja",
      "Broj spratova",
      "Dostupno od",
      "Broj soba",
      "Površina",
      "Okućnica",
      "Sprat"
    ];
    const label = knownLabels.find(
      candidate =>
        fieldText.startsWith(candidate) &&
        // The label must end at a word boundary.
        !/^\p{L}/u.test(fieldText.substring(candidate.length))
    );
    if (label === undefined) {
      return;
    }
    const value = fieldText
      .substring(label.length)
      .replace(/^[\s:]+/, "")
      .trim();
    switch (label) {
      case "Površina":
        data.area = this._numberOrNull(parseFloat(value));
        break;
      case "Okućnica":
        data.gardenSize = this._numberOrNull(parseFloat(value));
        break;
      case "Broj soba":
        data.numberOfRooms = this._numberOrNull(parseInt(value, 10));
        break;
      case "Broj spratova":
        data.numberOfFloors = this._numberOrNull(parseInt(value, 10));
        break;
      case "Sprat":
        data.floor = this._numberOrNull(parseInt(value, 10));
        break;
      case "Godina renoviranja":
        data.recentlyAdapted = true;
        break;
      case "Broj parking mjesta":
        data.parking = true;
        break;
      case "Dostupno od":
        // The "available from" date is stored as the published date.
        data.publishedDate = this._parseDayMonthYear(value);
        break;
      default:
        break;
    }
  }

  /** Extracts the category name that follows the "Kategorija" label. */
  _extractCategoryText(fieldText) {
    const cleaned = fieldText.replace(/[\n\r\t]/gm, "");
    const labelAt = cleaned.indexOf("Kategorija");
    if (labelAt === -1) {
      return "";
    }
    return cleaned.substring(labelAt + 10).trim();
  }

  /** Applies one "additional features" entry to the record. */
  _applyAdditionalField(data, fieldText) {
    const booleanFeatures = {
      Internet: "internet",
      Garaža: "garage",
      Klima: "airCondition",
      Balkon: "balcony",
      Ostava: "storeRoom",
      Podrum: "basementAttic",
      "Blindirana vrata": "antiTheftDoor",
      Voda: "water",
      Kablovska: "cableTV",
      Uknjiženo: "registeredInZkBooks",
      Plin: "gas",
      Alarm: "alarm",
      "Video nadzor": "videoSurveillance",
      Lift: "elevator",
      Novogradnja: "newBuilding"
    };
    if (Object.prototype.hasOwnProperty.call(booleanFeatures, fieldText)) {
      data[booleanFeatures[fieldText]] = true;
      return;
    }
    switch (fieldText) {
      case "Grijanje - centralno":
        data.heatingType = HEATING_TYPE.CENTRAL_CITY.id;
        break;
      case "Grijanje - plin":
        data.heatingType = HEATING_TYPE.GAS.id;
        break;
      case "Grijanje - struja":
        data.heatingType = HEATING_TYPE.ELECTRICITY.id;
        break;
      case "Grijanje":
        data.heatingType = HEATING_TYPE.OTHER.id;
        break;
      case "Namješten":
        data.furnishingType = FURNISHING_TYPE.FURNISHED.id;
        break;
      default:
        break;
    }
  }

  /**
   * Maps the category name shown on the ad page to an AD_CATEGORY id.
   *
   * @returns {*} The category id, or undefined for unknown names.
   */
  getAdCategoryId(categoryText) {
    switch (categoryText) {
      case "Stan":
        return AD_CATEGORY.FLAT.id;
      case "Građevinsko zemljiste":
      case "Industrijsko zemljiste":
      case "Poljoprivredno zemljiste":
        return AD_CATEGORY.LAND.id;
      case "Kuća":
        return AD_CATEGORY.HOUSE.id;
      case "Poslovni prostor":
      case "Kancelarije":
        return AD_CATEGORY.OFFICE.id;
      case "Apartmani":
        return AD_CATEGORY.APARTMENT.id;
      case "Garaža":
        return AD_CATEGORY.GARAGE.id;
      case "Vikendica":
        return AD_CATEGORY.COTTAGE.id;
      default:
        return undefined;
    }
  }

  /**
   * Maps the ribbon text of a search result to an AD_TYPE string id.
   *
   * @returns {*} The ad type id, or undefined for unknown texts.
   */
  getAdTypeId(adTypeText) {
    switch (adTypeText) {
      case "PRODAJA":
        return AD_TYPE.AD_TYPE_SALE.stringId;
      case "NAJAM":
        return AD_TYPE.AD_TYPE_RENT.stringId;
      default:
        return undefined;
    }
  }

  /** Resolves after `ms` milliseconds. */
  async sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Persists one page of crawled ads.
   *
   * Only the first saver is used so that its result (which carries the
   * inserted records) can be returned to the caller.
   *
   * @throws {TypeError} When no saver is configured.
   */
  async saveCrawledResults(results) {
    const primarySaver = this.savers ? this.savers[0] : undefined;
    if (!primarySaver) {
      throw new TypeError(
        "[SALJIC] No saver configured; cannot persist crawled results"
      );
    }
    return primarySaver.save(results);
  }
}
module.exports = SaljicCrawler;