Use a generator function to index pages; crawl categories in parallel

This commit is contained in:
Bilal Catic
2019-09-23 10:46:31 +02:00
parent c4f6c6e1c3
commit 3140fdf0c0
6 changed files with 127 additions and 77 deletions

View File

@@ -1,7 +1,8 @@
"use strict";
let fetch = require("node-fetch");
let cheerio = require("cheerio");
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const {
AD_TYPE,
@@ -13,78 +14,113 @@ const {
} = require("../../common/enums");
/**
 * Lookup tables that translate crawler-level enum values into the query-string
 * fragments appended to the OLX search URL, plus the detail-field limit.
 */
const OLX_ENUMS = {
  // Ad-type filter fragment; crawling every type needs no extra parameter,
  // hence the empty string.
  OLX_AD_TYPE: {
    [CRAWLER_AD_TYPE.ALL]: "",
    [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
    [CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje"
  },

  // Category filter fragment keyed by AD_CATEGORY value.
  OLX_AD_CATEGORY: {
    [AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23",
    [AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24",
    [AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29",
    [AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25",
    [AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
    [AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
  },

  MAX_DETAIL_FIELDS: 30
};
class OlxCrawler {
/**
 * Stores the crawler configuration on the instance: every argument is copied
 * to a same-named property, and `baseUrl` is set to the OLX search endpoint.
 *
 * Defaults visible here: `savers = []`, `crawlerAdTypes = CRAWLER_AD_TYPE.ALL`,
 * `crawlerAdCategories = [CATEGORY_FLAT, CATEGORY_HOUSE]`, `maxPages = 1000`,
 * `maxResultsPerPage = 100`, `maxAge = 30`.
 *
 * NOTE(review): this span comes from a rendered commit diff, so removed and
 * added lines appear side by side (see the `]` / `],` pair that closes the
 * `crawlerAdCategories` default). `fromPage`, `toPage` and `maxResults` look
 * like the pre-change parameters that `maxPages`, `maxResultsPerPage` and
 * `maxAge` supersede — confirm against the committed file before relying on
 * the parameter order.
 * NOTE(review): the unit of `maxAge` is not visible here; it is only passed
 * through to the savers — confirm with the saver implementation.
 */
constructor(
fromPage = 1,
toPage = 10,
maxResults = 1000,
savers = [],
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
crawlerAdCategories = [
AD_CATEGORY.CATEGORY_FLAT,
AD_CATEGORY.CATEGORY_HOUSE
]
],
maxPages = 1000,
maxResultsPerPage = 100,
maxAge = 30
) {
this.fromPage = fromPage;
this.toPage = toPage;
this.maxResults = maxResults;
this.savers = savers;
// Search endpoint with the sort parameters baked in (descending by "datum");
// ad-type, category and page fragments are appended to it elsewhere.
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages;
this.maxResultsPerPage = maxResultsPerPage;
this.maxAge = maxAge;
}
async crawl() {
console.log("[OLX] Crawler started");
const crawlAdTypes = this.crawlerAdTypes;
const crawlAdCategories = this.crawlerAdCategories;
const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`;
if (crawlAdCategories && crawlAdTypes) {
const asyncPagesIndexingByCategory = [];
if (crawlAdCategories) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
asyncPagesIndexingByCategory.push(
this.indexPages(
`${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}`
)
);
indexGenerators.push(this.categoryIndexer(adCategory));
}
await Promise.all(asyncPagesIndexingByCategory);
let done = false;
while (!done) {
const categoryIndexerPromises = [];
for (const indexGenerator of indexGenerators) {
categoryIndexerPromises.push(indexGenerator.next());
}
Promise.all(categoryIndexerPromises).then(singlePageResults => {
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
this.saveCrawledResults(singlePageResult, this.maxAge)
.then(numberOfSaved => {})
.catch(error =>
console.log("[POSTGRES Saver] Error saving results : ", error)
);
} else {
//Generator returned undefined, no more pages
indexGenerators.splice(index, 1);
if (indexGenerators.length === 0) {
done = true;
}
}
}
});
await this.sleep(500);
}
}
console.log("[OLX] Crawler finished");
}
async indexPages(url) {
const startPage = this.fromPage;
const endPage = this.toPage;
const maxResultsPerPage = this.maxResults;
async *categoryIndexer(adCategory) {
let pageToIndex = 1;
for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
const pageUrl = `${url}&stranica=${pageNumber}`;
const singlePageResults = await this.indexSinglePage(
pageUrl,
maxResultsPerPage
);
await this.saveCrawledResults(singlePageResults);
await this.sleep(5000);
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
if (urlAdTypePart && urlCategoryPart) {
while (true) {
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
const singlePageResults = await this.indexSinglePage(
urlPageToCrawl,
this.maxResultsPerPage
);
console.log("indexing ", adCategory, " page : ", pageToIndex);
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
yield singlePageResults;
} else {
return undefined;
}
++pageToIndex;
if (pageToIndex === this.maxPages) {
return undefined;
}
}
} else {
return undefined;
}
}
@@ -111,18 +147,16 @@ class OlxCrawler {
let actualNoOfResults =
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) {
const adData = await this.scrapeAd(hrefs[i]);
if (adData) {
singlePageResults.push(adData);
}
await this.sleep(500);
asyncScraping.push(this.scrapeAd(hrefs[i]));
}
return singlePageResults;
const scrapedData = await Promise.all(asyncScraping);
return scrapedData;
} catch (e) {
console.error("Exception caught:" + e);
return [];
}
}
@@ -135,24 +169,32 @@ class OlxCrawler {
const username = $(
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
).text();
)
.text()
.trim();
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
return null;
}
const title = $("#naslovartikla").text();
const title = $("#naslovartikla")
.text()
.trim();
const descriptions = $(".artikal_detaljniopis_tekst");
const category = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
).text();
)
.text()
.trim();
//====== PRICE DETECTION AND EXTRACTION =====
let price = null;
const normalPriceValue = $("#pc > p:nth-child(2)").text();
const urgentPriceValue = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
).text();
)
.text()
.trim();
if (normalPriceValue && normalPriceValue.length > 0) {
price = normalPriceValue;
@@ -258,7 +300,9 @@ class OlxCrawler {
const time = $("time").attr("datetime");
const numberOfViews = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
).text();
)
.text()
.trim();
//===========================================
//=========================================
@@ -296,8 +340,14 @@ class OlxCrawler {
price: parsedPrice,
area: parsedArea,
gardenSize: parsedGardenSize,
shortDescription: descriptions.first().text(),
longDescription: descriptions.last().text(),
shortDescription: descriptions
.first()
.text()
.trim(),
longDescription: descriptions
.last()
.text()
.trim(),
streetNumber: 0,
streetName: "",
locality: "",
@@ -370,11 +420,11 @@ class OlxCrawler {
return new Promise(resolve => setTimeout(resolve, ms));
}
async saveCrawledResults(results) {
async saveCrawledResults(results, maxAge) {
const savers = this.savers;
for (const saver of savers) {
await saver.save(results);
await saver.save(results, maxAge);
}
}
}