Use async generator functions to index pages; crawl categories in parallel

This commit is contained in:
Bilal Catic
2019-09-23 10:46:31 +02:00
parent c4f6c6e1c3
commit 3140fdf0c0
6 changed files with 127 additions and 77 deletions

View File

@@ -13,12 +13,12 @@ const PostgresSaver = require("./savers/postgres");
const crawlers = [ const crawlers = [
new OlxCrawler( new OlxCrawler(
OLX_CONFIG.OLX_START_PAGE,
OLX_CONFIG.OLX_END_PAGE,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
[new PostgresSaver()], [new PostgresSaver()],
OLX_CONFIG.OLX_CRAWLER_AD_TYPE, OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
OLX_CONFIG.OLX_MAX_PAGES,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
OLX_CONFIG.OLX_MAX_AGE
) )
]; ];

View File

@@ -2,29 +2,29 @@
require("dotenv").config({ path: "../../.env" }); require("dotenv").config({ path: "../../.env" });
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums"); const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
const crawlerAdType = const olxCrawlerAdType =
process.env.OLX_CRAWLER_AD_TYPE !== undefined process.env.OLX_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
: null; : null;
const parsedCrawlerAdCategories = const olxParsedCrawlerAdCategories =
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim() category.trim()
) )
: ["CATEGORY_FLAT", "CATEGORY_HOUSE"]; : ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
const transformedCrawlerAdCategories = parsedCrawlerAdCategories const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories
.map(categoryName => AD_CATEGORY[categoryName]) .map(categoryName => AD_CATEGORY[categoryName])
.filter(category => !!category); .filter(category => !!category);
const OLX_CONFIG = { const OLX_CONFIG = {
OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE) || 1, OLX_MAX_PAGES: parseInt(process.env.MAX_PAGES) || 500,
OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE) || 10,
OLX_MAX_RESULTS_PER_PAGE: OLX_MAX_RESULTS_PER_PAGE:
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE, OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories,
OLX_MAX_AGE: parseInt(process.env.OLX_MAX_AGE) || 30
}; };
module.exports = { module.exports = {

View File

@@ -7,9 +7,9 @@ class PostgresSaver {
return true; return true;
} }
async save(results) { async save(results, maxAge) {
console.log("[POSTGRES] Saving..."); console.log("[POSTGRES] Saving...");
await bulkUpsertRealEstates(results); await bulkUpsertRealEstates(results, maxAge);
} }
close() { close() {

View File

@@ -1,7 +1,8 @@
"use strict"; "use strict";
let fetch = require("node-fetch"); const fetch = require("node-fetch");
let cheerio = require("cheerio"); const cheerio = require("cheerio");
const Promise = require("bluebird");
const { const {
AD_TYPE, AD_TYPE,
@@ -13,78 +14,113 @@ const {
} = require("../../common/enums"); } = require("../../common/enums");
const OLX_ENUMS = { const OLX_ENUMS = {
OLX_AD_TYPE: {}, OLX_AD_TYPE: {
OLX_AD_CATEGORY: {}, [CRAWLER_AD_TYPE.ALL]: "",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje"
},
OLX_AD_CATEGORY: {
[AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23",
[AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24",
[AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29",
[AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25",
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
},
MAX_DETAIL_FIELDS: 30 MAX_DETAIL_FIELDS: 30
}; };
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = "";
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja";
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30";
class OlxCrawler { class OlxCrawler {
constructor( constructor(
fromPage = 1,
toPage = 10,
maxResults = 1000,
savers = [], savers = [],
crawlerAdTypes = CRAWLER_AD_TYPE.ALL, crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
crawlerAdCategories = [ crawlerAdCategories = [
AD_CATEGORY.CATEGORY_FLAT, AD_CATEGORY.CATEGORY_FLAT,
AD_CATEGORY.CATEGORY_HOUSE AD_CATEGORY.CATEGORY_HOUSE
] ],
maxPages = 1000,
maxResultsPerPage = 100,
maxAge = 30
) { ) {
this.fromPage = fromPage;
this.toPage = toPage;
this.maxResults = maxResults;
this.savers = savers; this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories; this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages;
this.maxResultsPerPage = maxResultsPerPage;
this.maxAge = maxAge;
} }
async crawl() { async crawl() {
console.log("[OLX] Crawler started"); console.log("[OLX] Crawler started");
const crawlAdTypes = this.crawlerAdTypes;
const crawlAdCategories = this.crawlerAdCategories; const crawlAdCategories = this.crawlerAdCategories;
const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`; if (crawlAdCategories) {
const indexGenerators = [];
if (crawlAdCategories && crawlAdTypes) {
const asyncPagesIndexingByCategory = [];
for (const adCategory of crawlAdCategories) { for (const adCategory of crawlAdCategories) {
asyncPagesIndexingByCategory.push( indexGenerators.push(this.categoryIndexer(adCategory));
this.indexPages(
`${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}`
)
);
} }
await Promise.all(asyncPagesIndexingByCategory); let done = false;
while (!done) {
const categoryIndexerPromises = [];
for (const indexGenerator of indexGenerators) {
categoryIndexerPromises.push(indexGenerator.next());
}
Promise.all(categoryIndexerPromises).then(singlePageResults => {
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
this.saveCrawledResults(singlePageResult, this.maxAge)
.then(numberOfSaved => {})
.catch(error =>
console.log("[POSTGRES Saver] Error saving results : ", error)
);
} else {
//Generator returned undefined, no more pages
indexGenerators.splice(index, 1);
if (indexGenerators.length === 0) {
done = true;
}
}
}
});
await this.sleep(500);
}
} }
console.log("[OLX] Crawler finished"); console.log("[OLX] Crawler finished");
} }
async indexPages(url) { async *categoryIndexer(adCategory) {
const startPage = this.fromPage; let pageToIndex = 1;
const endPage = this.toPage;
const maxResultsPerPage = this.maxResults;
for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) { const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
const pageUrl = `${url}&stranica=${pageNumber}`; const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
const singlePageResults = await this.indexSinglePage( if (urlAdTypePart && urlCategoryPart) {
pageUrl, while (true) {
maxResultsPerPage const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
); const singlePageResults = await this.indexSinglePage(
await this.saveCrawledResults(singlePageResults); urlPageToCrawl,
await this.sleep(5000); this.maxResultsPerPage
);
console.log("indexing ", adCategory, " page : ", pageToIndex);
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
yield singlePageResults;
} else {
return undefined;
}
++pageToIndex;
if (pageToIndex === this.maxPages) {
return undefined;
}
}
} else {
return undefined;
} }
} }
@@ -111,18 +147,16 @@ class OlxCrawler {
let actualNoOfResults = let actualNoOfResults =
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) { for (let i = 0; i < actualNoOfResults; i++) {
const adData = await this.scrapeAd(hrefs[i]); asyncScraping.push(this.scrapeAd(hrefs[i]));
if (adData) {
singlePageResults.push(adData);
}
await this.sleep(500);
} }
return singlePageResults; const scrapedData = await Promise.all(asyncScraping);
return scrapedData;
} catch (e) { } catch (e) {
console.error("Exception caught:" + e); console.error("Exception caught:" + e);
return [];
} }
} }
@@ -135,24 +169,32 @@ class OlxCrawler {
const username = $( const username = $(
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
).text(); )
.text()
.trim();
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) { if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
return null; return null;
} }
const title = $("#naslovartikla").text(); const title = $("#naslovartikla")
.text()
.trim();
const descriptions = $(".artikal_detaljniopis_tekst"); const descriptions = $(".artikal_detaljniopis_tekst");
const category = $( const category = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
).text(); )
.text()
.trim();
//====== PRICE DETECTION AND EXTRACTION ===== //====== PRICE DETECTION AND EXTRACTION =====
let price = null; let price = null;
const normalPriceValue = $("#pc > p:nth-child(2)").text(); const normalPriceValue = $("#pc > p:nth-child(2)").text();
const urgentPriceValue = $( const urgentPriceValue = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p" "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
).text(); )
.text()
.trim();
if (normalPriceValue && normalPriceValue.length > 0) { if (normalPriceValue && normalPriceValue.length > 0) {
price = normalPriceValue; price = normalPriceValue;
@@ -258,7 +300,9 @@ class OlxCrawler {
const time = $("time").attr("datetime"); const time = $("time").attr("datetime");
const numberOfViews = $( const numberOfViews = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
).text(); )
.text()
.trim();
//=========================================== //===========================================
//========================================= //=========================================
@@ -296,8 +340,14 @@ class OlxCrawler {
price: parsedPrice, price: parsedPrice,
area: parsedArea, area: parsedArea,
gardenSize: parsedGardenSize, gardenSize: parsedGardenSize,
shortDescription: descriptions.first().text(), shortDescription: descriptions
longDescription: descriptions.last().text(), .first()
.text()
.trim(),
longDescription: descriptions
.last()
.text()
.trim(),
streetNumber: 0, streetNumber: 0,
streetName: "", streetName: "",
locality: "", locality: "",
@@ -370,11 +420,11 @@ class OlxCrawler {
return new Promise(resolve => setTimeout(resolve, ms)); return new Promise(resolve => setTimeout(resolve, ms));
} }
async saveCrawledResults(results) { async saveCrawledResults(results, maxAge) {
const savers = this.savers; const savers = this.savers;
for (const saver of savers) { for (const saver of savers) {
await saver.save(results); await saver.save(results, maxAge);
} }
} }
} }

View File

@@ -1,7 +1,7 @@
"use strict"; "use strict";
const db = require("../../models/index"); const db = require("../../models/index");
const bulkUpsertRealEstates = async realEstateData => { const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
try { try {
const fieldsToUpdateIfDuplicate = [ const fieldsToUpdateIfDuplicate = [
"realEstateType", "realEstateType",

View File

@@ -16,8 +16,8 @@ SOURCE_EMAIL=info@saburly.com
#=============== CRAWLER SETTINGS===============# #=============== CRAWLER SETTINGS===============#
#==OLX== #==OLX==
OLX_START_PAGE=Crawler starts from this page OLX_MAX_PAGES=Restrict crawler to this number of pages
OLX_END_PAGE=Crawler ends with this page (including this page)
OLX_MAX_RESULTS_PER_PAGE=Only this number of results or fewer from one page will be scraped and saved OLX_MAX_RESULTS_PER_PAGE=Only this number of results or fewer from one page will be scraped and saved
OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
OLX_MAX_AGE=[in days] if an ad was last crawled more than this number of days ago, it will be re-crawled