Files
old-web/app/helpers/crawlers/olxClawler.js
2019-09-06 12:01:25 +02:00

365 lines
9.9 KiB
JavaScript

const fetch = require("node-fetch");
const cheerio = require("cheerio");
const { allRERequest, findPointInsideBoundingBox } = require("../db/dbHelper");
const { getRealEstateTypeEnum } = require("../enums");
const { getRegion, getMunicipality } = require("../codes");
const Promise = require("bluebird");
module.exports = class OlxCrawler {
//TODO figure best way to handle paging
constructor(hrefs = []) {
this.hrefs = hrefs;
}
async indexPages(urls) {
const indexers = [];
urls.forEach(url => {
indexers.push(new Indexer(url));
});
return Promise.map(indexers, function(indexer) {
return indexer.indexWithPagination();
}).then(async results => {
return results;
});
}
async crawl() {
const filteredResults = [];
const realestateRequests = await allRERequest();
const urls = this.createRequestUrls(realestateRequests);
let results = await this.indexPages(
urls,
this.fromPage,
this.toPage,
this.maxResults
);
const flatResults = results.flat();
if (flatResults) {
for (const finalResult of flatResults) {
if (null !== finalResult) {
if (
finalResult.lat !== undefined &&
finalResult.lat !== null &&
finalResult.lat !== ""
) {
const pointInsideBoundingBox = await findPointInsideBoundingBox(
[finalResult.lng, finalResult.lat],
finalResult.email,
finalResult.uuid
);
if (pointInsideBoundingBox[0].length !== 0) {
finalResult.hasLocation = true;
filteredResults.push(finalResult);
} else {
finalResult.hasLocation = false;
filteredResults.push(finalResult);
}
}
}
}
return filteredResults;
}
return [];
}
createRequestUrls(realestateRequests) {
const urls = [];
for (const request of realestateRequests) {
const realsestateType =
"kategorija=" +
getRealEstateTypeEnum(request.realEstateType).olxCategory;
const region = "kanton=" + getRegion(request.region).olxid;
const municipality =
"grad%5B%5D=" +
getMunicipality(request.region, request.municipality).olxid;
const sizeMin = "kvadrata_min=" + request.sizeMin;
const sizeMax = "kvadrata_max=" + request.sizeMax;
const priceMin = "od=" + request.priceMin;
const priceMax = "do=" + request.priceMax;
const olxUrl = {
url: `https://www.olx.ba/pretraga?${realsestateType}&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&${region}&${municipality}&${priceMin}&${priceMax}&vrsta=samoprodaja&${sizeMin}&${sizeMax}&stranica=`,
email: request.email,
uuid: request.uniqueId,
hrefs: this.hrefs
};
urls.push(olxUrl);
}
return urls;
}
};
/**
 * Indexes OLX search result pages and the listing pages they link to.
 *
 * Every instance wraps one descriptor `{ url, email, uuid, hrefs }`. Depending
 * on the method called, `url` is a search URL ending in the page parameter,
 * a concrete search page, or a single listing page.
 */
class Indexer {
  /**
   * Number of consecutive search pages fetched per pagination batch.
   * The pagination step and the batch size must be equal, otherwise pages
   * are either skipped or fetched twice.
   */
  static get PAGES_PER_BATCH() {
    return 5;
  }

  /**
   * @param {Object} olxUrl descriptor with `url`, `email`, `uuid` and `hrefs`
   *   (`hrefs[uuid]` lists listing URLs that must not be indexed again)
   * @param {Array} hrefResutls stored on the instance; not read by any method
   *   of this class
   */
  constructor(olxUrl, hrefResutls) {
    this.olxUrl = olxUrl;
    this.hrefResutls = hrefResutls;
  }

  /**
   * Fetches search pages in batches and indexes every listing found.
   *
   * Starting at `pageNumber`, a batch of `PAGES_PER_BATCH` pages is fetched in
   * parallel. While at least one page of a batch contains listings, the next
   * batch is processed recursively; the recursion stops at the first batch
   * without any listing.
   *
   * @param {number} [pageNumber=1] first page of the batch; a falsy value
   *   falls back to the page number at the end of the URL, or 1
   * @returns {Promise<Array>} results of `indexSingle` for all listings, later
   *   batches first; an empty array when nothing was found or an error occurred
   */
  async indexWithPagination(pageNumber = 1) {
    const trailingPage = this.olxUrl.url.match(/\d+$/);
    const firstPage =
      pageNumber || (trailingPage ? Number(trailingPage[0]) : 1);
    try {
      const { indexers } = this.prepareIndexers([firstPage]);
      const fetchedPages = await Promise.all(
        indexers.map(indexer => indexer.indexPage(firstPage))
      );
      // A page that could not be fetched must not abort the whole batch.
      const pageResults = fetchedPages.filter(
        page => page !== null && page !== undefined
      );
      const listingIndexers = this.prepareHrefIndexers(pageResults);
      const indexListings = () =>
        Promise.all(listingIndexers.map(indexer => indexer.indexSingle()));

      const hasResults = pageResults.some(page => page.hasResults);
      if (!hasResults) {
        return listingIndexers.length === 0 ? [] : await indexListings();
      }

      // Walk the following batch first, then index this batch's listings.
      const laterResults = await this.indexWithPagination(
        firstPage + Indexer.PAGES_PER_BATCH
      );
      const currentResults = await indexListings();
      return [...laterResults, ...currentResults];
    } catch (e) {
      console.error("Error has accured", e);
      return [];
    }
  }

  /**
   * Creates the indexers for one batch of consecutive search pages.
   *
   * @param {Array|null} pageNr array whose first element is the first page
   *   number (for example a regex match); when falsy the batch starts at
   *   page 1 and the URL is used unchanged as prefix
   * @returns {{indexers: Array<Indexer>, lastPageNumber: number}} the page
   *   indexers and the number of the last page in the batch
   */
  prepareIndexers(pageNr) {
    const firstPage = pageNr ? Number(pageNr[0]) : 1;
    // Only strip an existing page number when one was supplied by the caller.
    const baseUrl = pageNr
      ? this.olxUrl.url.replace(/\d+$/, "")
      : this.olxUrl.url;
    const indexers = [];
    let lastPageNumber;
    for (
      let page = firstPage;
      page < firstPage + Indexer.PAGES_PER_BATCH;
      page++
    ) {
      lastPageNumber = page;
      indexers.push(
        new Indexer({
          url: baseUrl + page,
          email: this.olxUrl.email,
          uuid: this.olxUrl.uuid,
          hrefs: this.olxUrl.hrefs
        })
      );
    }
    return { indexers, lastPageNumber };
  }

  /**
   * Creates one listing indexer per href found on the given search pages.
   *
   * @param {Object|Array<Object>} results one page result or an array of page
   *   results as returned by `indexPage`; null/undefined entries and entries
   *   without an `hrefs` array are ignored
   * @returns {Array<Indexer>} indexers for the individual listing pages
   */
  prepareHrefIndexers(results) {
    const pages = Array.isArray(results) ? results : [results];
    const indexers = [];
    for (const page of pages) {
      if (page === null || page === undefined || !Array.isArray(page.hrefs)) {
        continue;
      }
      for (const href of page.hrefs) {
        indexers.push(
          new Indexer({
            url: href,
            email: page.olxUrl.email,
            uuid: page.olxUrl.uuid,
            hrefs: this.olxUrl.hrefs
          })
        );
      }
    }
    return indexers;
  }

  /**
   * Fetches one search page and collects the link of every listed item.
   *
   * @param {number} pageNumber value echoed back in the result
   * @returns {Promise<Object>} `{ hrefs, hasResults, pageNumber, olxUrl }`;
   *   when fetching or parsing fails the error is logged and an empty result
   *   (`hrefs: []`, `hasResults: false`) is returned
   */
  async indexPage(pageNumber) {
    try {
      const res = await fetch(this.olxUrl.url);
      const body = await res.text();
      const $ = cheerio.load(body);
      const hrefs = [];
      $("#rezultatipretrage")
        .find(".listitem")
        .each((i, elem) => {
          // The first anchor of an item is taken as the link to its page.
          hrefs.push(
            $(elem)
              .find("a")
              .first()
              .attr("href")
          );
        });
      return {
        hrefs,
        hasResults: hrefs.length > 0,
        pageNumber,
        olxUrl: this.olxUrl
      };
    } catch (e) {
      console.error("Exception caught:" + e);
      return {
        hrefs: [],
        hasResults: false,
        pageNumber,
        olxUrl: this.olxUrl
      };
    }
  }

  /**
   * Fetches a single listing page and extracts its data.
   *
   * All fields are read through positional CSS selectors, so they depend on
   * the exact markup of the listing page.
   *
   * @returns {Promise<Object|null>} the listing data; `{}` when the descriptor
   *   has no URL; `null` when the URL is already listed in `hrefs[uuid]` or
   *   when fetching/parsing fails (the error is logged)
   */
  async indexSingle() {
    try {
      const { url, email, uuid, hrefs } = this.olxUrl;
      if (url === undefined) {
        return {};
      }
      const knownUrls = hrefs ? hrefs[uuid] : undefined;
      if (knownUrls && knownUrls.includes(url)) {
        return null;
      }

      const res = await fetch(url);
      const body = await res.text();
      const $ = cheerio.load(body);

      const title = $("#naslovartikla")
        .text()
        .trim();
      const realEstateType = $(
        "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
      ).text();
      const price = $("#pc > p:nth-child(2)").text();
      const size = $("#dodatnapolja1 > div:nth-child(1) > div.df2").text();
      const address = $("#dodatnapolja1 > div:nth-child(5) > div.df2").text();
      const gardenSize = $(
        "#dodatnapolja1 > div:nth-child(6) > div.df2"
      ).text();
      const location = $(
        "#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija"
      ).attr("data-content");
      const time = $("time").attr("datetime");
      const olxId = $(
        "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2"
      ).text();
      const descriptions = $(".artikal_detaljniopis_tekst");

      // The coordinates are taken from the first `LatLng(a, b)` call in the page.
      const coordinates = /LatLng\(([0-9]+\.[0-9]+),\s+([0-9]+\.[0-9]+)\)/.exec(
        body
      );
      const lat = coordinates ? coordinates[1] : "";
      const lng = coordinates ? coordinates[2] : "";

      // A listing without the location attribute throws here and is reported
      // as null by the catch block below.
      const locationArray = location.split(",");
      const parsedGardenSize = parseFloat(gardenSize);

      return {
        realEstateType: this.getCategoryId(realEstateType),
        email,
        uuid,
        olxId,
        url,
        title,
        price: this.parsePrice(price),
        size: parseFloat(size),
        gardenSize: Number.isNaN(parsedGardenSize) ? 0 : parsedGardenSize,
        address,
        region: locationArray[0],
        municipality: locationArray[1],
        time,
        shortDescription: descriptions.first().text(),
        longDescription: descriptions.last().text(),
        lat,
        lng,
        // NaN entries when the page exposes no coordinates.
        loc: [parseFloat(lat), parseFloat(lng)]
      };
    } catch (e) {
      console.error("Exception caught: " + e.message);
    }
    return null;
  }

  /**
   * Parses a price text in which "." separates thousands, e.g. "1.250.000 KM".
   *
   * @param {string} price raw price text
   * @returns {number} the numeric price, or 0 when the text has no leading number
   */
  parsePrice(price) {
    // Every separator has to go: removing only the first one turns
    // "1.250.000" into "1250.000", which parses as 1250.
    const parsed = parseFloat(String(price).replace(/\./g, ""));
    return Number.isNaN(parsed) ? 0 : parsed;
  }

  /**
   * Maps an OLX category label to the internal real-estate type id.
   *
   * @param {string} category category label as shown on the listing page
   * @returns {string} "stan", "vikendica", "kuca", or "" for any other label
   */
  getCategoryId(category) {
    switch (category) {
      case "Stanovi":
        return "stan";
      case "Vikendice":
        return "vikendica";
      case "Kuće":
        return "kuca";
      default:
        return "";
    }
  }
}