Merge branch 'make-crawler-smarter' into 'master'

Make crawler smarter

See merge request saburly/marketalarm/web!33
This commit was merged in pull request #33.
This commit is contained in:
Bilal Catic
2019-09-25 17:15:14 +00:00
12 changed files with 351 additions and 106 deletions

View File

@@ -12,8 +12,6 @@ const AD_CATEGORY = {
CATEGORY_GARAGE: "GARAGE" CATEGORY_GARAGE: "GARAGE"
}; };
const IGNORED_USERNAMES = [];
const AD_STATUS = { const AD_STATUS = {
STATUS_NORMAL: 1, STATUS_NORMAL: 1,
STATUS_RESERVED: 2, STATUS_RESERVED: 2,
@@ -36,7 +34,6 @@ const CRAWLER_AD_TYPE = {
module.exports = { module.exports = {
AD_TYPE, AD_TYPE,
IGNORED_USERNAMES,
AD_CATEGORY, AD_CATEGORY,
AD_STATUS, AD_STATUS,
AD_AGENCY, AD_AGENCY,

View File

@@ -6,7 +6,10 @@ const APP_URL =
? process.env.APP_URL || "http://market-alarm" ? process.env.APP_URL || "http://market-alarm"
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`; : process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
module.exports = { module.exports = {
APP_PORT, APP_PORT,
APP_URL APP_URL,
DEFAULT_TIMEZONE
}; };

View File

@@ -13,23 +13,28 @@ const PostgresSaver = require("./savers/postgres");
const crawlers = [ const crawlers = [
new OlxCrawler( new OlxCrawler(
OLX_CONFIG.OLX_START_PAGE,
OLX_CONFIG.OLX_END_PAGE,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
[new PostgresSaver()], [new PostgresSaver()],
OLX_CONFIG.OLX_CRAWLER_AD_TYPE, OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
OLX_CONFIG.OLX_MAX_PAGES,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
OLX_CONFIG.OLX_IGNORED_USERNAMES,
OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES
) )
]; ];
async function crawlAll() { async function crawlAll() {
for (let crawler of crawlers) { for (let crawler of crawlers) {
try { try {
await crawler.crawl(); const newRealEstates = await crawler.crawl();
console.log("Number of new real estates : ", newRealEstates.length);
} catch (e) { } catch (e) {
console.log("Error crawling. Trying next crawler! ", e); console.log("Error crawling. Trying next crawler! ", e);
} }
} }
} }
crawlAll(); (async () => {
await crawlAll();
})();

View File

@@ -2,29 +2,37 @@
require("dotenv").config({ path: "../../.env" }); require("dotenv").config({ path: "../../.env" });
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums"); const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
const crawlerAdType = const olxCrawlerAdType =
process.env.OLX_CRAWLER_AD_TYPE !== undefined process.env.OLX_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
: null; : null;
const parsedCrawlerAdCategories = const olxParsedCrawlerAdCategories =
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim() category.trim()
) )
: ["CATEGORY_FLAT", "CATEGORY_HOUSE"]; : ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
const transformedCrawlerAdCategories = parsedCrawlerAdCategories const olxIgnoredUsernames =
process.env.OLX_IGNORED_USERNAMES !== undefined
? process.env.OLX_IGNORED_USERNAMES.split(",").map(username =>
username.trim()
)
: [];
const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories
.map(categoryName => AD_CATEGORY[categoryName]) .map(categoryName => AD_CATEGORY[categoryName])
.filter(category => !!category); .filter(category => !!category);
const OLX_CONFIG = { const OLX_CONFIG = {
OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE) || 1, OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500,
OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE) || 10,
OLX_MAX_RESULTS_PER_PAGE: OLX_MAX_RESULTS_PER_PAGE:
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE, OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories,
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000
}; };
module.exports = { module.exports = {

View File

@@ -1,3 +1,5 @@
const moment = require("moment");
const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate"); const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate");
class PostgresSaver { class PostgresSaver {
@@ -9,7 +11,33 @@ class PostgresSaver {
async save(results) { async save(results) {
console.log("[POSTGRES] Saving..."); console.log("[POSTGRES] Saving...");
await bulkUpsertRealEstates(results);
const savedRecords = await bulkUpsertRealEstates(results);
if (Array.isArray(savedRecords)) {
const newRealEstates = [];
const existingRealEstates = [];
for (const savedRecord of savedRecords) {
const { createdAt, updatedAt } = savedRecord;
const createdAtMoment = moment.utc(createdAt);
const updatedAtMoment = moment.utc(updatedAt);
if (createdAtMoment.isSame(updatedAtMoment, "second")) {
newRealEstates.push(savedRecord);
} else {
existingRealEstates.push(savedRecord);
}
}
return {
newRecords: newRealEstates,
existingRecords: existingRealEstates
};
} else {
throw { message: "[POSTGRES] Failed to save records" };
}
} }
close() { close() {

View File

@@ -1,102 +1,170 @@
"use strict"; "use strict";
let fetch = require("node-fetch"); const fetch = require("node-fetch");
let cheerio = require("cheerio"); const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");
const { const {
AD_TYPE, AD_TYPE,
AD_CATEGORY, AD_CATEGORY,
IGNORED_USERNAMES,
AD_AGENCY, AD_AGENCY,
AD_STATUS, AD_STATUS,
CRAWLER_AD_TYPE CRAWLER_AD_TYPE
} = require("../../common/enums"); } = require("../../common/enums");
const { DEFAULT_TIMEZONE } = require("../../config/appConfig");
const OLX_ENUMS = { const OLX_ENUMS = {
OLX_AD_TYPE: {}, OLX_AD_TYPE: {
OLX_AD_CATEGORY: {}, [CRAWLER_AD_TYPE.ALL]: "",
MAX_DETAIL_FIELDS: 30 [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje"
},
OLX_AD_CATEGORY: {
[AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23",
[AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24",
[AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29",
[AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25",
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
},
MAX_DETAIL_FIELDS: 30,
OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm",
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
}; };
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = "";
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja";
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30";
class OlxCrawler { class OlxCrawler {
constructor( constructor(
fromPage = 1,
toPage = 10,
maxResults = 1000,
savers = [], savers = [],
crawlerAdTypes = CRAWLER_AD_TYPE.ALL, crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
crawlerAdCategories = [ crawlerAdCategories = [
AD_CATEGORY.CATEGORY_FLAT, AD_CATEGORY.CATEGORY_FLAT,
AD_CATEGORY.CATEGORY_HOUSE AD_CATEGORY.CATEGORY_HOUSE
] ],
maxPages = 1000,
maxResultsPerPage = 100,
ignoredUsernames = [],
delayBetweenPages = 1000
) { ) {
this.fromPage = fromPage;
this.toPage = toPage;
this.maxResults = maxResults;
this.savers = savers; this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories; this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages;
this.maxResultsPerPage = maxResultsPerPage;
this.ignoredUsernames = ignoredUsernames;
this.delayBetweenPages = delayBetweenPages;
} }
async crawl() { async crawl() {
console.log("[OLX] Crawler started"); console.log("[OLX] Crawler started");
const crawlAdTypes = this.crawlerAdTypes;
const crawlAdCategories = this.crawlerAdCategories; const crawlAdCategories = this.crawlerAdCategories;
const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`; const newRealEstates = [];
if (crawlAdCategories && crawlAdTypes) { if (crawlAdCategories) {
const asyncPagesIndexingByCategory = []; const indexGenerators = [];
for (const adCategory of crawlAdCategories) { for (const adCategory of crawlAdCategories) {
asyncPagesIndexingByCategory.push( indexGenerators.push(this.categoryIndexer(adCategory));
this.indexPages(
`${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}`
)
);
} }
await Promise.all(asyncPagesIndexingByCategory); let done = false;
while (!done) {
const categoryIndexerPromises = [];
const generatorsToRemove = [];
for (const indexGenerator of indexGenerators) {
categoryIndexerPromises.push(indexGenerator.next());
generatorsToRemove.push(false);
}
const singlePageResults = await Promise.all(categoryIndexerPromises);
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords, existingRecords } = saveResults;
newRealEstates.push(...newRecords);
for (const existingRecord of existingRecords) {
const { publishedDate, renewedDate } = existingRecord;
const publishedDateMoment = moment.utc(publishedDate);
const renewedDateMoment = moment.utc(renewedDate);
const stopCrawlingThisCategory = publishedDateMoment.isSame(
renewedDateMoment,
"minute"
);
if (stopCrawlingThisCategory) {
generatorsToRemove[index] = true;
// console.log("\tGenerator ", index + 1, "has no more new ads");
break;
}
}
} else {
//Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true;
// console.log("Generator ", index + 1, "has no more pages");
}
}
// console.log("Generators state : ", generatorsToRemove);
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
if (generatorsToRemove[i]) {
// console.log("\tRemove generator ", i + 1);
indexGenerators.splice(i, 1);
}
}
if (indexGenerators.length === 0) {
done = true;
}
await this.sleep(this.delayBetweenPages);
}
} }
console.log("[OLX] Crawler finished"); console.log("[OLX] Crawler finished");
return newRealEstates;
} }
async indexPages(url) { async *categoryIndexer(adCategory) {
const startPage = this.fromPage; let pageToIndex = 1;
const endPage = this.toPage;
const maxResultsPerPage = this.maxResults;
for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) { const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
const singlePageResults = await this.indexSinglePage( const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
url, if (urlAdTypePart && urlCategoryPart) {
pageNumber, while (true) {
maxResultsPerPage const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
); const singlePageResults = await this.indexSinglePage(
await this.saveCrawledResults(singlePageResults); urlPageToCrawl,
await this.sleep(5000); this.maxResultsPerPage
);
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
yield singlePageResults;
} else {
return undefined;
}
++pageToIndex;
if (pageToIndex === this.maxPages) {
return undefined;
}
}
} else {
return undefined;
} }
} }
async indexSinglePage(urlWithoutPageNumber, pageNumber, maxResultsPerPage) { async indexSinglePage(url, maxResultsPerPage) {
try { try {
const url = `${urlWithoutPageNumber}&stranica=${pageNumber}`;
const res = await fetch(url); const res = await fetch(url);
const body = await res.text(); const body = await res.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
let hrefs = []; let hrefs = [];
const singlePageResults = [];
$("#rezultatipretrage") $("#rezultatipretrage")
.find(".listitem") .find(".listitem")
@@ -113,50 +181,60 @@ class OlxCrawler {
let actualNoOfResults = let actualNoOfResults =
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) { for (let i = 0; i < actualNoOfResults; i++) {
console.log(`Scraping : ${hrefs[i]}`); asyncScraping.push(this.scrapeAd(hrefs[i]));
const adData = await this.scrapeAd(hrefs[i]);
if (adData) {
singlePageResults.push(adData);
}
await this.sleep(500);
} }
return singlePageResults; const scrapedData = await Promise.all(asyncScraping);
const filteredScrapedData = scrapedData.filter(adData => !!adData);
return filteredScrapedData;
} catch (e) { } catch (e) {
console.error("Exception caught:" + e); console.error("Exception caught:" + e);
return [];
} }
} }
async scrapeAd(url) { async scrapeAd(url) {
//console.log("Scraping : ", url);
try { try {
const adPageSource = await fetch(url); const adPageSource = await fetch(url);
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
let status = AD_STATUS.STATUS_NORMAL; let status = AD_STATUS.STATUS_NORMAL;
const username = $( const propertySelectors = {
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" username:
).text(); "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
title: "#naslovartikla",
descriptions: ".artikal_detaljniopis_tekst",
category:
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
};
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) { const username = $(propertySelectors.username)
.text()
.trim();
if (this.ignoredUsernames.includes((username || "").toLowerCase())) {
return null; return null;
} }
const title = $("#naslovartikla").text(); const title = $(propertySelectors.title)
const descriptions = $(".artikal_detaljniopis_tekst"); .text()
const category = $( .trim();
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" const descriptions = $(propertySelectors.descriptions);
).text(); const category = $(propertySelectors.category)
.text()
.trim();
//====== PRICE DETECTION AND EXTRACTION ===== //====== PRICE DETECTION AND EXTRACTION =====
let price = null; let price = null;
const normalPriceValue = $("#pc > p:nth-child(2)").text(); const normalPriceValue = $("#pc > p:nth-child(2)").text();
const urgentPriceValue = $( const urgentPriceValue = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p" "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
).text(); )
.text()
.trim();
if (normalPriceValue && normalPriceValue.length > 0) { if (normalPriceValue && normalPriceValue.length > 0) {
price = normalPriceValue; price = normalPriceValue;
@@ -208,6 +286,39 @@ class OlxCrawler {
} }
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
const publishedDate = $(publishedDateValueSelector)
.text()
.trim();
const publishedDateMoment = moment.tz(
publishedDate,
OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (!publishedDateMoment.isValid()) {
throw { message: "Invalid published date ! Check parsing format" };
}
const renewedDate = $(renewedDateFullValueSelector)
.data("content")
.trim();
const renewedDateMoment = moment.tz(
renewedDate,
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (!renewedDateMoment) {
throw {
message:
"Invalid renewed date ! Check how parser parsed renewed date text"
};
}
adType = $( adType = $(
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
@@ -262,7 +373,9 @@ class OlxCrawler {
const time = $("time").attr("datetime"); const time = $("time").attr("datetime");
const numberOfViews = $( const numberOfViews = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
).text(); )
.text()
.trim();
//=========================================== //===========================================
//========================================= //=========================================
@@ -300,8 +413,14 @@ class OlxCrawler {
price: parsedPrice, price: parsedPrice,
area: parsedArea, area: parsedArea,
gardenSize: parsedGardenSize, gardenSize: parsedGardenSize,
shortDescription: descriptions.first().text(), shortDescription: descriptions
longDescription: descriptions.last().text(), .first()
.text()
.trim(),
longDescription: descriptions
.last()
.text()
.trim(),
streetNumber: 0, streetNumber: 0,
streetName: "", streetName: "",
locality: "", locality: "",
@@ -312,7 +431,9 @@ class OlxCrawler {
country: "", country: "",
locationLat, locationLat,
locationLong, locationLong,
adStatus: status adStatus: status,
publishedDate: publishedDateMoment.toISOString(),
renewedDate: renewedDateMoment.toISOString()
}; };
return data; return data;
@@ -334,6 +455,8 @@ class OlxCrawler {
return AD_CATEGORY.CATEGORY_HOUSE; return AD_CATEGORY.CATEGORY_HOUSE;
case "Poslovni prostori": case "Poslovni prostori":
return AD_CATEGORY.CATEGORY_OFFICE; return AD_CATEGORY.CATEGORY_OFFICE;
case "Apartmani":
return AD_CATEGORY.CATEGORY_APARTMENT;
default: default:
return undefined; return undefined;
} }
@@ -370,6 +493,58 @@ class OlxCrawler {
return parseFloat(formattedPriceText); return parseFloat(formattedPriceText);
} }
parseRenewedDate(renewedDateText) {
const currentMoment = moment.tz(DEFAULT_TIMEZONE);
if (renewedDateText.includes("Prije mjesec dana")) {
return currentMoment.add(-1, "month");
}
if (renewedDateText.includes("Jučer")) {
return currentMoment.add(-1, "day");
}
if (renewedDateText.includes("Prije sat")) {
return currentMoment.add(-1, "hour");
}
if (renewedDateText.includes("dan")) {
// format for this case should be "Prije N dana" or "Prije N dan"
const dateParts = renewedDateText.split(" ");
if (dateParts[0] === "Prije") {
const numberOfDays = parseInt(dateParts[1]);
return currentMoment.add(-1 * numberOfDays, "days");
} else {
return undefined;
}
}
if (renewedDateText.includes("sat")) {
const dateParts = renewedDateText.split(" ");
const parsedHours =
dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined;
if (!parsedHours) {
return undefined;
}
return currentMoment.add(-1 * parsedHours, "hours");
}
const todayVariations = ["min", "sekund", "maloprije"];
for (const todayVariation of todayVariations) {
if (renewedDateText.includes(todayVariation)) {
return currentMoment;
}
}
const renewedDateMoment = moment.tz(
renewedDateText,
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
return renewedDateMoment.isValid() ? renewedDateMoment : undefined;
}
async sleep(ms) { async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms)); return new Promise(resolve => setTimeout(resolve, ms));
} }
@@ -377,9 +552,13 @@ class OlxCrawler {
async saveCrawledResults(results) { async saveCrawledResults(results) {
const savers = this.savers; const savers = this.savers;
for (const saver of savers) { // for (const saver of savers) {
await saver.save(results); // await saver.save(results);
} // }
//For now, we use only Postgres saver, so ...
return await savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
} }
} }

View File

@@ -23,10 +23,13 @@ const bulkUpsertRealEstates = async realEstateData => {
"longDescription", "longDescription",
"gardenSize", "gardenSize",
"adStatus", "adStatus",
"updatedAt" "updatedAt",
"renewedDate"
]; ];
return await db.RealEstate.bulkCreate(realEstateData, { return await db.RealEstate.bulkCreate(realEstateData, {
updateOnDuplicate: fieldsToUpdateIfDuplicate updateOnDuplicate: fieldsToUpdateIfDuplicate,
returning: true
}); });
} catch (e) { } catch (e) {
console.log("Error bulk upserting realEstates : ", e); console.log("Error bulk upserting realEstates : ", e);

View File

@@ -0,0 +1,21 @@
"use strict";
module.exports = {
up: (queryInterface, Sequelize) => {
return Promise.all([
queryInterface.addColumn("RealEstates", "publishedDate", {
type: Sequelize.DATE
}),
queryInterface.addColumn("RealEstates", "renewedDate", {
type: Sequelize.DATE
})
]);
},
down: (queryInterface, Sequelize) => {
return Promise.all([
queryInterface.removeColumn("RealEstates", "renewedDate"),
queryInterface.removeColumn("RealEstates", "publishedDate")
]);
}
};

View File

@@ -43,14 +43,12 @@ module.exports = (sequelize, DataTypes) => {
country: DataTypes.TEXT, country: DataTypes.TEXT,
locationLat: DataTypes.REAL, locationLat: DataTypes.REAL,
locationLong: DataTypes.REAL, locationLong: DataTypes.REAL,
lastTimeCrawled: {
type: DataTypes.DATE,
allowNull: false
},
title: DataTypes.TEXT, title: DataTypes.TEXT,
shortDescription: DataTypes.TEXT, shortDescription: DataTypes.TEXT,
longDescription: DataTypes.TEXT, longDescription: DataTypes.TEXT,
adStatus: DataTypes.INTEGER adStatus: DataTypes.INTEGER,
publishedDate: DataTypes.DATE,
renewedDate: DataTypes.DATE
}); });
RealEstate.associate = models => { RealEstate.associate = models => {

View File

@@ -16,8 +16,9 @@ SOURCE_EMAIL=info@saburly.com
#=============== CRAWLER SETTINGS===============# #=============== CRAWLER SETTINGS===============#
#==OLX== #==OLX==
OLX_START_PAGE=Crawler starts from this page OLX_MAX_PAGES=Restrict crawler to this number of pages
OLX_END_PAGE=Crawler ends with this page (including this page)
OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page

6
package-lock.json generated
View File

@@ -2697,9 +2697,9 @@
"integrity": "sha512-bV7f+6l2QigeBBZSM/6yTNq4P2fNpSWj/0e7jQcy87A8e7o2nAfP/34/2ky5Vw4B9S446EtIhodAzkFCcR4dQg==" "integrity": "sha512-bV7f+6l2QigeBBZSM/6yTNq4P2fNpSWj/0e7jQcy87A8e7o2nAfP/34/2ky5Vw4B9S446EtIhodAzkFCcR4dQg=="
}, },
"moment-timezone": { "moment-timezone": {
"version": "0.5.25", "version": "0.5.26",
"resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.25.tgz", "resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.26.tgz",
"integrity": "sha512-DgEaTyN/z0HFaVcVbSyVCUU6HeFdnNC3vE4c9cgu2dgMTvjBUBdBzWfasTBmAW45u5OIMeCJtU8yNjM22DHucw==", "integrity": "sha512-sFP4cgEKTCymBBKgoxZjYzlSovC20Y6J7y3nanDc5RoBIXKlZhoYwBoZGe3flwU6A372AcRwScH8KiwV6zjy1g==",
"requires": { "requires": {
"moment": ">= 2.9.0" "moment": ">= 2.9.0"
} }

View File

@@ -34,6 +34,8 @@
"express": "^4.16.4", "express": "^4.16.4",
"express-ejs-layouts": "^2.5.0", "express-ejs-layouts": "^2.5.0",
"express-layout": "^0.1.0", "express-layout": "^0.1.0",
"moment": "^2.24.0",
"moment-timezone": "^0.5.26",
"node-fetch": "^2.3.0", "node-fetch": "^2.3.0",
"node-schedule": "^1.3.2", "node-schedule": "^1.3.2",
"pg": "^7.10.0", "pg": "^7.10.0",