Compare crawler results with db, and only save new if necessary

This commit is contained in:
Nedim Uka
2019-06-21 16:48:19 +02:00
parent 80ff9bcb6b
commit 2f474619ca
4 changed files with 86 additions and 203 deletions

View File

@@ -3,9 +3,10 @@ const cheerio = require('cheerio');
const { allRERequest, findPointInsideBoundingBox } = require('../url');
const { getRealEstateTypeEnum } = require('../enums');
const { getRegion, getMunicipality } = require('../codes')
const Promise = require("bluebird");
module.exports = class OlxCrawler {
// TODO: figure out the best way to handle paging
constructor(fromPage = 0, toPage = 10, maxResults = 1000) {
this.fromPage = fromPage;
this.toPage = toPage;
@@ -18,19 +19,22 @@ module.exports = class OlxCrawler {
const body = await res.text();
const $ = cheerio.load(body);
//TODO figure out what to do with username
const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text();
// if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) {
// return null;
// }
//TODO remove properties that are not needed, and add some if they are missing
const title = $('#naslovartikla').text();
const category = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text();
const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text();
const price = $('#pc > p:nth-child(2)').text();
const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text();
const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text();
const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text();
const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text();
const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content');
const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text();
@@ -45,19 +49,22 @@ module.exports = class OlxCrawler {
let lng = '',
lat = '';
const images = [];
const imgMatches = body.match(imgRe);
/**
 * Extracts an integer room count from a free-text rooms label
 * (for example "3" or "Trosoban (3)").
 *
 * The first contiguous run of ASCII digits is parsed as a base-10 integer.
 * The previous implementation collected every digit character and joined them
 * with Array.prototype.join()'s default "," separator, so a label such as "10"
 * became "1,0" and was parsed as 1.
 *
 * @param {string} rooms - Raw rooms text; null/undefined is treated as empty.
 * @returns {number} The parsed room count, or NaN when the text has no digits.
 */
const parseRooms = (rooms) => {
  const text = rooms == null ? '' : String(rooms);
  const firstDigitRun = /\d+/.exec(text);
  return firstDigitRun ? parseInt(firstDigitRun[0], 10) : NaN;
};
/**
 * Parses a price string that uses "." as a thousands separator
 * (for example "1.250.000 KM") into a number.
 *
 * Every dot is removed before parsing. The previous implementation called
 * replace(".", "") with a string pattern, which strips only the first dot,
 * so "1.250.000" was parsed as 1250.
 *
 * @param {string} price - Raw price text; null/undefined is treated as empty.
 * @returns {number} The parsed price, or NaN when the text does not start
 *   with a number (parseFloat semantics).
 */
const parsePrice = (price) => {
  const text = price == null ? '' : String(price);
  return parseFloat(text.replace(/\./g, ''));
};
for (let i = 0; imgMatches && i < imgMatches.length; i++) {
let img = imgMatches[i].replace("href\":", "")
img = img.replace("\"", "");
img = img.replace("\"", "");
images.push(img);
}
// TODO: we don't save images?
// const images = [];
// const imgMatches = body.match(imgRe);
// for (let i = 0; imgMatches && i < imgMatches.length; i++) {
// let img = imgMatches[i].replace("href\":", "")
// img = img.replace("\"", "");
// img = img.replace("\"", "");
// images.push(img);
// }
// const uploadPromises = images.map(img => {
// const imgFixed = eval(`'${img}'`);
@@ -77,18 +84,17 @@ module.exports = class OlxCrawler {
const locationArray = location.split(",");
const region = locationArray[0];
const municipality = locationArray[1];
console.log(location);
console.log(locationArray);
const data = {
// category: this.getCategoryId(category),
realEstateType: this.getCategoryId(realEstateType),
email : email,
olxId: olxId,
category: category,
// category: category,
url,
title,
price: isNaN(parsedPrice) ? price : parsedPrice,
size: parseFloat(size),
gardenSize: parseFloat(gardenSize),
address,
region,
municipality,
@@ -145,21 +151,15 @@ module.exports = class OlxCrawler {
}
}
// getCategoryId (category) {
// if (category === 'Stanovi') {
// return CATEGORY_FLAT;
// } else if (category === 'Zemljišta') {
// return CATEGORY_LAND;
// } else if (category === 'Kuće') {
// return CATEGORY_HOUSE;
// } else if (category === 'Poslovni prostori') {
// return CATEGORY_OFFICE;
// }
// }
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
getCategoryId (category) {
if (category === 'Stanovi') {
return 'stan';
} else if (category === 'Vikendice') {
return 'vikendica';
} else if (category === 'Kuće') {
return 'kuca';
}
}
async indexPages(urls, start, end, maxResults = 1000) {
// let results = {};