Merge branch 'add-crawler-for-prostor-page' into 'master'

Add crawler for prostor page

See merge request saburly/marketalarm/web!59
This commit was merged in pull request #59.
This commit is contained in:
Bilal Catic
2019-10-25 10:21:24 +00:00
13 changed files with 414 additions and 79 deletions

View File

@@ -70,6 +70,9 @@ const AD_TYPE = {
};
const AD_CATEGORY = {
ALL: {
id: "ALL"
},
FLAT: {
id: "FLAT",
title: "Stan",
@@ -134,7 +137,8 @@ const AD_STATUS = {
const AD_AGENCY = {
OLX: "OLX",
RENTAL: "RENTAL"
RENTAL: "RENTAL",
PROSTOR: "PROSTOR"
};
const CRAWLER_AD_TYPE = {

View File

@@ -28,6 +28,8 @@ const MAX_REAL_ESTATES_IN_EMAIL =
const MAX_REAL_ESTATES_IN_FIRST_EMAIL =
parseInt(process.env.MAX_REAL_ESTATES_IN_FIRST_EMAIL) || 5;
const PRINT_CRAWLER_DEBUG = process.env.PRINT_CRAWLER_DEBUG_INFO || 0;
module.exports = {
APP_PORT,
APP_URL,
@@ -36,5 +38,6 @@ module.exports = {
STOP_CRAWLER,
AWS_EMAIL_CONFIG,
MAX_REAL_ESTATES_IN_EMAIL,
MAX_REAL_ESTATES_IN_FIRST_EMAIL
MAX_REAL_ESTATES_IN_FIRST_EMAIL,
PRINT_CRAWLER_DEBUG
};

View File

@@ -5,9 +5,10 @@ const { AD_CATEGORY } = require("../common/enums");
const getRealEstateTypes = (req, res) => {
const title = "Koju nekretninu tražite?";
const realEstateTypes = Object.keys(AD_CATEGORY).map(
category => AD_CATEGORY[category]
);
const realEstateTypes = Object.keys(AD_CATEGORY)
.map(category => AD_CATEGORY[category])
.filter(category => category.title);
res.render("realEstateType", { realEstateTypes, title });
};

View File

@@ -5,10 +5,15 @@
All environment specific configuration is read here and
passed to the crawlers and savers.
*/
const OlxCrawler = require("./specific/olx");
const RentalCrawler = require("./specific/rental");
const OlxCrawler = require("./specificCrawlers/olx");
const RentalCrawler = require("./specificCrawlers/rental");
const ProstorCrawler = require("./specificCrawlers/prostor");
const { OLX_CONFIG, RENTAL_CONFIG } = require("./crawlerConfig");
const {
OLX_CONFIG,
RENTAL_CONFIG,
PROSTOR_CONFIG
} = require("./crawlerConfig");
const PostgresSaver = require("./savers/postgres");
async function crawlAll() {
@@ -32,6 +37,15 @@ async function crawlAll() {
RENTAL_CONFIG.RENTAL_MAX_RESULTS_PER_PAGE,
RENTAL_CONFIG.RENTAL_IGNORED_USERNAMES,
RENTAL_CONFIG.RENTAL_DELAY_BETWEEN_PAGES
),
new ProstorCrawler(
[postgresSaver],
PROSTOR_CONFIG.PROSTOR_CRAWLER_AD_TYPE,
PROSTOR_CONFIG.PROSTOR_CRAWLER_AD_CATEGORIES,
PROSTOR_CONFIG.PROSTOR_MAX_PAGES,
PROSTOR_CONFIG.PROSTOR_MAX_RESULTS_PER_PAGE,
PROSTOR_CONFIG.PROSTOR_IGNORED_USERNAMES,
PROSTOR_CONFIG.PROSTOR_DELAY_BETWEEN_PAGES
)
];

View File

@@ -1,74 +1,12 @@
"use strict";
require("dotenv").config({ path: __dirname + "/./../../.env" });
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
const olxCrawlerAdType =
process.env.OLX_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
: null;
const rentalCrawlerAdType =
process.env.RENTAL_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.RENTAL_CRAWLER_AD_TYPE]
: null;
const olxParsedCrawlerAdCategories =
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim()
)
: ["FLAT", "HOUSE"];
const rentalParsedCrawlerAdCategories =
process.env.RENTAL_CRAWLER_AD_CATEGORIES !== undefined
? process.env.RENTAL_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim()
)
: ["FLAT", "HOUSE"];
const olxIgnoredUsernames =
process.env.OLX_IGNORED_USERNAMES !== undefined
? process.env.OLX_IGNORED_USERNAMES.split(",").map(username =>
username.trim()
)
: [];
const rentalIgnoredUsernames = [];
const transformedOlxCrawlerAdCategories = olxParsedCrawlerAdCategories
.map(categoryName =>
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
)
.filter(category => !!category);
const transformedRentalCrawlerAdCategories = rentalParsedCrawlerAdCategories
.map(categoryName =>
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
)
.filter(category => !!category);
const OLX_CONFIG = {
OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500,
OLX_MAX_RESULTS_PER_PAGE:
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories,
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000
};
const RENTAL_CONFIG = {
RENTAL_MAX_PAGES: parseInt(process.env.RENTAL_MAX_PAGES) || 500,
RENTAL_MAX_RESULTS_PER_PAGE:
parseInt(process.env.RENTAL_MAX_RESULTS_PER_PAGE) || 50,
RENTAL_CRAWLER_AD_TYPE: rentalCrawlerAdType || CRAWLER_AD_TYPE.NONE,
RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories,
RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [],
RENTAL_DELAY_BETWEEN_PAGES:
parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000
};
const OLX_CONFIG = require("./specificConfigs/olx");
const RENTAL_CONFIG = require("./specificConfigs/rental");
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
module.exports = {
OLX_CONFIG,
RENTAL_CONFIG
RENTAL_CONFIG,
PROSTOR_CONFIG
};

View File

@@ -0,0 +1,37 @@
"use strict";
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
const olxCrawlerAdType =
process.env.OLX_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
: null;
const olxParsedCrawlerAdCategories =
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim()
)
: ["FLAT", "HOUSE"];
const olxIgnoredUsernames =
process.env.OLX_IGNORED_USERNAMES !== undefined
? process.env.OLX_IGNORED_USERNAMES.split(",").map(username =>
username.trim()
)
: [];
const transformedOlxCrawlerAdCategories = olxParsedCrawlerAdCategories
.map(categoryName =>
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
)
.filter(category => !!category);
module.exports = {
OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500,
OLX_MAX_RESULTS_PER_PAGE:
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories,
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000
};

View File

@@ -0,0 +1,33 @@
"use strict";
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
const prostorCrawlerAdType =
process.env.PROSTOR_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.PROSTOR_CRAWLER_AD_TYPE]
: null;
const prostorParsedCrawlerAdCategories =
process.env.PROSTOR_CRAWLER_AD_CATEGORIES !== undefined
? process.env.PROSTOR_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim()
)
: ["FLAT", "HOUSE"];
const prostorIgnoredUsernames = [];
const transformedProstorCrawlerAdCategories = prostorParsedCrawlerAdCategories
.map(categoryName =>
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
)
.filter(category => !!category);
module.exports = {
PROSTOR_MAX_PAGES: parseInt(process.env.PROSTOR_MAX_PAGES) || 100,
PROSTOR_MAX_RESULTS_PER_PAGE:
parseInt(process.env.PROSTOR_MAX_RESULTS_PER_PAGE) || 5000,
PROSTOR_CRAWLER_AD_TYPE: prostorCrawlerAdType || CRAWLER_AD_TYPE.NONE,
PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories,
PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [],
PROSTOR_DELAY_BETWEEN_PAGES:
parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000
};

View File

@@ -0,0 +1,33 @@
"use strict";
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
const rentalCrawlerAdType =
process.env.RENTAL_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.RENTAL_CRAWLER_AD_TYPE]
: null;
const rentalParsedCrawlerAdCategories =
process.env.RENTAL_CRAWLER_AD_CATEGORIES !== undefined
? process.env.RENTAL_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim()
)
: ["FLAT", "HOUSE"];
const rentalIgnoredUsernames = [];
const transformedRentalCrawlerAdCategories = rentalParsedCrawlerAdCategories
.map(categoryName =>
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
)
.filter(category => !!category);
module.exports = {
RENTAL_MAX_PAGES: parseInt(process.env.RENTAL_MAX_PAGES) || 500,
RENTAL_MAX_RESULTS_PER_PAGE:
parseInt(process.env.RENTAL_MAX_RESULTS_PER_PAGE) || 50,
RENTAL_CRAWLER_AD_TYPE: rentalCrawlerAdType || CRAWLER_AD_TYPE.NONE,
RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories,
RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [],
RENTAL_DELAY_BETWEEN_PAGES:
parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000
};

View File

@@ -13,7 +13,10 @@ const {
CRAWLER_AD_TYPE
} = require("../../common/enums");
const { DEFAULT_TIMEZONE } = require("../../config/appConfig");
const {
DEFAULT_TIMEZONE,
PRINT_CRAWLER_DEBUG
} = require("../../config/appConfig");
const OLX_ENUMS = {
OLX_AD_TYPE: {
@@ -156,6 +159,10 @@ class OlxCrawler {
}
async indexSinglePage(url, maxResultsPerPage) {
if (PRINT_CRAWLER_DEBUG) {
console.log("[OLX] Index page : ", url);
}
try {
const res = await fetch(url);
const body = await res.text();

View File

@@ -0,0 +1,252 @@
"use strict";
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const {
AD_TYPE,
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE
} = require("../../common/enums");
const { PRINT_CRAWLER_DEBUG } = require("../../config/appConfig");
const PROSTOR_ENUMS = {
PROSTOR_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "&action=0",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&action=1",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&action=2"
},
PROSTOR_AD_CATEGORY: {
[AD_CATEGORY.ALL.id]: "",
[AD_CATEGORY.FLAT.id]: "&type=7",
[AD_CATEGORY.HOUSE.id]: "&type=8",
[AD_CATEGORY.LAND.id]: "&type=10",
[AD_CATEGORY.OFFICE.id]: "&type=9",
[AD_CATEGORY.APARTMENT.id]: "&type=11",
[AD_CATEGORY.GARAGE.id]: "&type=14"
//[AD_CATEGORY.COTTAGE.id]: ""
},
PROSTOR_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss",
PROSTOR_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
};
class ProstorCrawler {
constructor(
savers = [],
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
maxPages = 5000,
maxResultsPerPage = 5000,
ignoredUsernames = [],
delayBetweenPages = 1000
) {
this.savers = savers;
this.baseUrl = "https://prostor.ba/pretraga";
this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories;
this.maxResultsPerPage = maxResultsPerPage;
}
async crawl() {
const crawlAdCategories = this.crawlerAdCategories;
const newRealEstates = [];
if (crawlAdCategories) {
for (const adCategory of crawlAdCategories) {
const urlAdTypePart =
PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
const singleCategoryResults = await this.extractRealEstates(
urlPageToCrawl
);
const resultsSubset = singleCategoryResults.slice(
0,
this.maxResultsPerPage
);
const saveResults = await this.saveCrawledResults(resultsSubset);
const { newRecords } = saveResults;
newRealEstates.push(...newRecords);
}
}
}
return newRealEstates;
}
async extractRealEstates(url) {
if (PRINT_CRAWLER_DEBUG) {
console.log("[PROSTOR] Index page : ", url);
}
try {
const res = await fetch(url);
const body = await res.text();
const $ = cheerio.load(body);
const scriptElement = $(
"body > div > div.container-fluid > script:nth-child(7)"
);
if (
scriptElement[0] &&
scriptElement[0].children &&
scriptElement[0].children[0] &&
scriptElement[0].children[0].data
) {
const scriptData = scriptElement[0].children[0].data;
try {
// script element data contains JS code and we need to extract only data for realEstates
// data string starts with : var map; var markers = [{"r ...
// so we remove first 23 characters
//
// real estate JSON data ends with ...}, ]; map = new...
// so we need to find index of that substring to know where to stop
// we will NOT include trailing comma because it breaks JSON parse, so we have to close ] bracket manually
const jsonEndIndex = scriptData.indexOf(", ]; map = new");
if (jsonEndIndex > -1) {
const jsonData = scriptData.substring(23, jsonEndIndex) + "]";
const realEstates = JSON.parse(jsonData);
const transformedRealEstates = [];
for (const realEstate of realEstates) {
const transformedRealEstate = ProstorCrawler.transformRealEstateData(
realEstate
);
if (transformedRealEstate) {
transformedRealEstates.push(transformedRealEstate);
}
}
return transformedRealEstates;
} else {
throw {
message: "Something is wrong with JSON data or data is moved"
};
}
} catch (e) {
console.log(e);
throw { message: "Can't find ad data JSON" };
}
}
} catch (e) {
console.error("[PROSTOR] Exception caught:", e.message);
return [];
}
}
static transformRealEstateData(realEstateData) {
try {
const { lat, lng, property_name, price, size, link } = realEstateData;
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
const linkParts = link.split("/");
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
const prostorId = linkParts[4];
const url = `https://prostor.ba${link}`;
if (!adType || !realEstateType || !prostorId) {
return null;
}
const adStatus = AD_STATUS.STATUS_NORMAL;
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
const parsedArea = parseFloat(size);
const data = {
url,
agencyObjectId: prostorId,
originAgencyName: AD_AGENCY.PROSTOR,
realEstateType,
adType,
title: property_name,
price: parsedPrice,
area: parsedArea,
gardenSize: null,
shortDescription: "",
longDescription: "",
streetNumber: 0,
streetName: "",
locality: "",
municipality: "",
city: "",
region: "",
entity: "",
country: "",
locationLat: lat,
locationLong: lng,
adStatus,
publishedDate: null,
renewedDate: null
};
return data;
} catch (e) {
console.error(
"[PROSTOR] Exception caught: " + e.message,
"\r\nURL:",
url
);
return null;
}
}
//======= HELPER FUNCTIONS =============
static getAdCategoryId(categoryText) {
switch (categoryText) {
case "stan":
return AD_CATEGORY.FLAT.id;
case "kuca":
return AD_CATEGORY.HOUSE.id;
case "apartman":
return AD_CATEGORY.APARTMENT.id;
case "poslovni-prostor":
return AD_CATEGORY.OFFICE.id;
case "garaza":
return AD_CATEGORY.GARAGE.id;
case "zemljiste":
return AD_CATEGORY.LAND.id;
default:
return undefined;
}
}
static getAdTypeId(adTypeText) {
switch (adTypeText) {
case "prodaja":
return AD_TYPE.AD_TYPE_SALE;
case "najam":
return AD_TYPE.AD_TYPE_RENT;
default:
return undefined;
}
}
async saveCrawledResults(results) {
const savers = this.savers;
// for (const saver of savers) {
// await saver.save(results);
// }
//For now, we use only Postgres saver, so ...
return await savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
}
module.exports = ProstorCrawler;

View File

@@ -14,7 +14,10 @@ const {
CRAWLER_AD_TYPE
} = require("../../common/enums");
const { DEFAULT_TIMEZONE } = require("../../config/appConfig");
const {
DEFAULT_TIMEZONE,
PRINT_CRAWLER_DEBUG
} = require("../../config/appConfig");
const RENTAL_ENUMS = {
RENTAL_AD_TYPE: {
@@ -159,7 +162,9 @@ class RentalCrawler {
}
async indexSinglePage(url, maxResultsPerPage) {
// console.log("[RENTAL] Index page : ", url);
if (PRINT_CRAWLER_DEBUG) {
console.log("[RENTAL] Index page : ", url);
}
try {
const res = await fetch(url);

View File

@@ -23,6 +23,7 @@ SOURCE_EMAIL=info@saburly.com
#=============== CRAWLER SETTINGS===============#
CRAWLER_INTERVAL=Interval to run cralwer(s), in seconds
STOP_CRAWLER=Non-zero value will skip crawler execution
PRINT_CRAWLER_DEBUG_INFO=Non-zero value will print crawler debugging info to the server console
#==OLX==
OLX_MAX_PAGES=Restrict crawler to this number of pages
OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
@@ -37,3 +38,10 @@ RENTAL_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check co
RENTAL_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
RENTAL_IGNORED_USERNAMES=!!! This is not used for rental crawler !!!
RENTAL_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page
#==PROSTOR==
PROSTOR_MAX_PAGES=!!! This is not used for prostor crawler !!!
PROSTOR_MAX_RESULTS_PER_PAGE=For Prostor crawler, this represents MAX RESULTS in total
PROSTOR_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!!
PROSTOR_DELAY_BETWEEN_PAGES=!!! This is not used for prostor crawler !!!

View File

@@ -1,6 +1,6 @@
"use strict";
const olxCrawler = require("../app/crawler/specific/olx");
const olxCrawler = require("../app/crawler/specificCrawlers/olx");
const urlToScrape = process.argv[2] || undefined;