Correctly parse rooms, size, price etc.

This commit is contained in:
Edin Dazdarevic
2017-04-04 02:16:22 +02:00
parent 80aded2fc3
commit a6c5ee80e8
7 changed files with 177 additions and 39 deletions

View File

@@ -61,7 +61,7 @@
/**
 * Babel runtime helper: wraps a generator function so that calling the
 * wrapper returns a Promise driven by the generator.
 *
 * Each yielded value is passed through Promise.resolve(); its fulfilment is
 * fed back with gen.next(value) and its rejection with gen.throw(err). The
 * outer Promise resolves with the generator's return value and rejects with
 * any error the generator lets escape.
 *
 * @param {Function} fn generator function; invoked with the wrapper's own
 *   `this` and arguments.
 * @returns {Function} wrapper returning a Promise.
 */
function _asyncToGenerator(fn) {
  return function () {
    var iterator = fn.apply(this, arguments);
    return new Promise(function (resolve, reject) {
      // Resume the generator with "next" or "throw" and handle what it yields.
      function advance(method, input) {
        var result;
        var payload;
        try {
          result = iterator[method](input);
          payload = result.value;
        } catch (failure) {
          reject(failure);
          return;
        }
        if (!result.done) {
          return Promise.resolve(payload).then(function (fulfilled) {
            advance("next", fulfilled);
          }, function (reason) {
            advance("throw", reason);
          });
        }
        resolve(payload);
      }
      return advance("next");
    });
  };
}
var MongoClient = __webpack_require__(3).MongoClient;
var url = 'mongodb://localhost:27017/example';
var url = 'mongodb://localhost:27017/kivi';
__webpack_require__(4);
@@ -70,13 +70,16 @@
var PORT = process.env.PORT || 3001;
var AGENTURA_KEY = process.env.AGENTURA_KEY || '1somethingverysecret';
var db = void 0;
//Monogo = await MongoClient.connect(url);
// TODO:
// db.results.ensureIndex({loc:"2d"})
//collection.ensureIndex("username",callback)
router.get('/search', function () {
var _ref = _asyncToGenerator(regeneratorRuntime.mark(function _callee(req, res, next) {
var bounds, db, properties, query, _bounds$split$map, _bounds$split$map2, lat1, lng1, lat2, lng2, box, all;
var bounds, minPrice, maxPrice, minSize, maxSize, rooms, adType, properties, query, _bounds$split$map, _bounds$split$map2, lat1, lng1, lat2, lng2, box, all;
return regeneratorRuntime.wrap(function _callee$(_context) {
while (1) {
@@ -84,12 +87,13 @@
case 0:
_context.prev = 0;
bounds = req.query.bounds || '';
_context.next = 4;
return MongoClient.connect(url);
case 4:
db = _context.sent;
properties = db.collection('results');
minPrice = req.query.minPrice;
maxPrice = req.query.maxPrice;
minSize = req.query.minSize;
maxSize = req.query.maxSize;
rooms = req.query.rooms;
adType = req.query.adType;
properties = db.collection('listings');
query = {};
@@ -107,35 +111,83 @@
});
}
_context.next = 10;
// Optional search filters, each applied only when its query parameter is set.
// Restrict to a single advert type (numeric code stored on the listing).
if (adType) {
  query = Object.assign(query, {
    adType: parseInt(adType, 10)
  });
}
// Price range. Both bounds go into ONE condition object: assigning the
// `price` key twice would let maxPrice overwrite (and drop) minPrice.
// -1 is what the crawler stores when a price could not be parsed, so such
// listings are excluded whenever a price filter is active.
if (minPrice || maxPrice) {
  query = Object.assign(query, {
    price: Object.assign({ "$ne": -1 }, minPrice ? { "$gte": parseFloat(minPrice) } : {}, maxPrice ? { "$lte": parseFloat(maxPrice) } : {})
  });
}
// "4+" means four or more rooms; any other value is an exact match.
if (rooms === "4+") {
  query = Object.assign(query, {
    rooms: {
      "$gte": 4
    }
  });
} else if (rooms) {
  query = Object.assign(query, {
    rooms: parseFloat(rooms)
  });
}
// Size range, merged into one condition for the same reason as price.
if (minSize || maxSize) {
  query = Object.assign(query, {
    size: Object.assign({}, minSize ? { "$gte": parseFloat(minSize) } : {}, maxSize ? { "$lte": parseFloat(maxSize) } : {})
  });
}
_context.next = 19;
return properties.find(query).toArray();
case 10:
case 19:
all = _context.sent;
res.json(all);
res.end();
_context.next = 15;
return db.close();
case 15:
_context.next = 21;
_context.next = 28;
break;
case 17:
_context.prev = 17;
case 24:
_context.prev = 24;
_context.t0 = _context['catch'](0);
console.log('error:', _context.t0);
next(_context.t0);
case 21:
case 28:
case 'end':
return _context.stop();
}
}
}, _callee, undefined, [[0, 17]]);
}, _callee, undefined, [[0, 24]]);
}));
return function (_x, _x2, _x3) {
@@ -155,8 +207,13 @@
});
app.use('/api', router);
app.listen(PORT, function () {
return console.log('Express server running at localhost: ' + PORT);
// Startup: open the shared MongoDB connection, make sure the 2d index on
// listings.loc exists, then start accepting HTTP requests.
MongoClient.connect(url).then(function (database) {
  db = database;
  // A failed index build is logged but does not stop the server from starting.
  return db.collection('listings').createIndex({ loc: "2d" }).catch(function (err) {
    console.log('error: could not create 2d index on listings.loc:', err);
  });
}).then(function () {
  app.listen(PORT, function () {
    return console.log('Express server running at localhost: ' + PORT);
  });
}).catch(function (err) {
  // Without a database connection the API cannot serve anything: fail loudly
  // instead of leaving an unhandled rejection.
  console.log('error: could not start server:', err);
  process.exit(1);
});
/***/ },

View File

@@ -1,7 +1,7 @@
import express from 'express'
import bodyParser from 'body-parser';
var MongoClient = require('mongodb').MongoClient;
var url = 'mongodb://localhost:27017/example';
var url = 'mongodb://localhost:27017/kivi';
require("babel-polyfill");
@@ -10,6 +10,9 @@ const router = express.Router({mergeParams: true})
const PORT = process.env.PORT || 3001;
const AGENTURA_KEY = process.env.AGENTURA_KEY || '1somethingverysecret';
let db;
//Monogo = await MongoClient.connect(url);
// TODO:
// db.results.ensureIndex({loc:"2d"})
//collection.ensureIndex("username",callback)
@@ -17,8 +20,13 @@ const AGENTURA_KEY = process.env.AGENTURA_KEY || '1somethingverysecret';
router.get('/search', async (req, res, next) => {
try {
const bounds = req.query.bounds || '';
const db = await MongoClient.connect(url);
const properties = db.collection('results');
const minPrice = req.query.minPrice;
const maxPrice = req.query.maxPrice;
const minSize = req.query.minSize;
const maxSize = req.query.maxSize;
const rooms = req.query.rooms;
const adType = req.query.adType;
const properties = db.collection('listings');
let query = {};
if (bounds) {
@@ -34,11 +42,62 @@ router.get('/search', async (req, res, next) => {
});
}
// Optional search filters, each applied only when its query parameter is set.

// Restrict to a single advert type (numeric code stored on the listing).
if (adType) {
query = Object.assign(query, {
adType: parseInt(adType, 10)
});
}

// Price range. Both bounds are collected into ONE condition object: assigning
// the `price` key twice would let maxPrice overwrite (and drop) minPrice.
// -1 is what the crawler stores when a price could not be parsed, so such
// listings are excluded whenever a price filter is active.
if (minPrice || maxPrice) {
const priceRange = {"$ne": -1};
if (minPrice) {
priceRange["$gte"] = parseFloat(minPrice);
}
if (maxPrice) {
priceRange["$lte"] = parseFloat(maxPrice);
}
query = Object.assign(query, {price: priceRange});
}

// "4+" means four or more rooms; any other value is an exact match.
if (rooms === "4+") {
query = Object.assign(query, {
rooms: {
"$gte": 4
}
});
} else if (rooms) {
query = Object.assign(query, {
rooms: parseFloat(rooms)
});
}

// Size range, merged into one condition for the same reason as price.
if (minSize || maxSize) {
const sizeRange = {};
if (minSize) {
sizeRange["$gte"] = parseFloat(minSize);
}
if (maxSize) {
sizeRange["$lte"] = parseFloat(maxSize);
}
query = Object.assign(query, {size: sizeRange});
}
const all = await properties.find(query).toArray();
res.json(all);
res.end();
await db.close();
} catch (e) {
console.log('error:', e);
next(e);
@@ -58,5 +117,10 @@ app.use(function(req, res, next) {
});
app.use('/api', router);
app.listen(PORT, () => console.log('Express server running at localhost: ' + PORT));
// Startup: open the shared MongoDB connection, make sure the 2d index on
// listings.loc exists, then start accepting HTTP requests.
MongoClient.connect(url)
.then((database) => {
db = database;
// A failed index build is logged but does not stop the server from starting.
return db.collection('listings').createIndex({loc: "2d"})
.catch((err) => console.log('error: could not create 2d index on listings.loc:', err));
})
.then(() => {
app.listen(PORT, () => console.log('Express server running at localhost: ' + PORT));
})
.catch((err) => {
// Without a database connection the API cannot serve anything: fail loudly
// instead of leaving an unhandled rejection.
console.log('error: could not start server:', err);
process.exit(1);
});

5
crawler/enums.js Normal file
View File

@@ -0,0 +1,5 @@
/** Numeric `adType` code for a listing that is for sale. */
export const AD_TYPE_SALE = 1;
/** Numeric `adType` code for a listing that is for rent. */
export const AD_TYPE_RENT = 2;
/**
 * Seller usernames whose listings the crawler skips. The crawler lower-cases
 * the scraped username before the lookup, so entries must be lower-case.
 */
export const IGNORED_USERNAMES = ['rental'];

View File

@@ -17,7 +17,7 @@ export default class MongoSaver {
} else {
console.log('Connection established to', this.url);
saver.db = db;
saver.collection = db.collection('results');
saver.collection = db.collection('listings');
saver.ready = true;
resolve();
}

View File

@@ -3,6 +3,7 @@
let fetch = require('node-fetch');
let cheerio = require('cheerio');
let fs = require('fs');
import {AD_TYPE_SALE, IGNORED_USERNAMES} from '../enums';
export default class OlxCrawler {
@@ -18,6 +19,12 @@ export default class OlxCrawler {
const body = await res.text();
const $ = cheerio.load(body);
const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text();
if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) {
return null;
}
const title = $('#naslovartikla').text();
const price = $('#pc > p:nth-child(2)').text();
const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text();
@@ -30,6 +37,7 @@ export default class OlxCrawler {
const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text();
const descriptions = $('.artikal_detaljniopis_tekst');
const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text();
const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g;
const imgRe = /href":("[^"]*")/g;
const matches = latLngRe.exec(body);
@@ -39,6 +47,10 @@ export default class OlxCrawler {
const images = [];
const imgMatches = body.match(imgRe);
/**
 * Extracts the room count from the scraped rooms text.
 *
 * Returns the first run of ASCII digits as a base-10 integer. The previous
 * version joined the digits with Array#join()'s default "," separator, so
 * parseInt stopped after the first digit and "10" came out as 1.
 *
 * @param {string} rooms scraped text; null/undefined is treated as empty.
 * @returns {number} the room count, or NaN when the text has no digits.
 */
const parseRooms = (rooms) => {
  const firstNumber = /\d+/.exec(String(rooms == null ? '' : rooms));
  return firstNumber ? parseInt(firstNumber[0], 10) : NaN;
};
/**
 * Parses a scraped price string such as "1.250.000 KM" into a number.
 *
 * Every "." is treated as a thousands separator and removed. The previous
 * version used replace(".", ""), which strips only the first dot, so
 * "1.250.000" parsed as 1250. A "," is treated as the decimal separator.
 * Trailing text such as the currency is ignored by parseFloat.
 *
 * @param {string} price scraped text; null/undefined is treated as empty.
 * @returns {number} the price, or NaN when the text does not start with a
 *   number (callers in this file substitute -1 in that case).
 */
const parsePrice = (price) =>
  parseFloat(
    String(price == null ? '' : price)
      .replace(/\./g, '')
      .replace(',', '.')
  );
for (let i = 0; imgMatches && i < imgMatches.length; i++) {
let img = imgMatches[i].replace("href\":", "")
img = img.replace("\"", "");
@@ -54,14 +66,14 @@ export default class OlxCrawler {
const data = {
url,
title,
price,
size,
rooms,
price: parsePrice(price) || -1,
size: parseFloat(size),
rooms: parseRooms(rooms),
floor: parseInt(floor),
address,
location,
adType,
adType: AD_TYPE_SALE,
time,
olxId,
shortDescription: descriptions.first().text(),
longDescription: descriptions.last().text(),
lat,
@@ -81,7 +93,7 @@ export default class OlxCrawler {
async indexPage(pageNr, maxResults = 1000) {
try {
console.log('Starting to index page: ' + pageNr);
const url = `http://www.olx.ba/pretraga?vrsta=samoizdavanje&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`;
const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`;
const res = await fetch(url);
const body = await res.text();

View File

@@ -48,7 +48,7 @@ export default class ListingDetails extends React.Component {
imageIndex={this.props.imageIndex} />
<div className="ld-price-address-box">
<div className="ld-price">
{listing.price}
{listing.price.toLocaleString('bs')} KM
</div>
<div className="ld-address">
@@ -60,7 +60,7 @@ export default class ListingDetails extends React.Component {
<div className="ld-features">
<div className="ld-feature-box">
<i className="fa fa-bed"></i>
{listing.rooms}
{listing.rooms} sobe
</div>
<div className="ld-feature-box">
<i className="fa fa-home"></i>
@@ -68,11 +68,11 @@ export default class ListingDetails extends React.Component {
</div>
<div className="ld-feature-box">
<i className="fa fa-home"></i>
1. sprat
{listing.floor}. sprat
</div>
<div className="ld-feature-box">
<i className="fa fa-home"></i>
Balkon
--
</div>
</div>
<div className="ld-check-availability">

View File

@@ -24,8 +24,8 @@ export default class Listings extends React.Component {
<img src={images[0]} alt=""></img>
</div>
<div className="pli-details">
<div className="price">{l.price}</div>
<div className="description">{l.rooms ? `${l.rooms}, `: null}{l.size ? `${l.size}m2`: null}</div>
<div className="price">{l.price.toLocaleString('bs')} KM</div>
<div className="description">{l.rooms ? `${l.rooms} sobe, `: null}{l.size ? `${l.size}m2`: null}</div>
<div className="address">
<div className="street">
{l.address}