use function generator to index pages; crawl in parallel

This commit is contained in:
Bilal Catic
2019-09-23 10:46:31 +02:00
parent c4f6c6e1c3
commit 3140fdf0c0
6 changed files with 127 additions and 77 deletions

View File

@@ -13,12 +13,12 @@ const PostgresSaver = require("./savers/postgres");
// Crawler instances to run: a single OlxCrawler configured from OLX_CONFIG, with one
// PostgresSaver as its list of savers.
// NOTE(review): two argument lists are visible here — one starting with start page, end
// page and results per page, and one starting with the savers followed by ad type,
// categories, max pages, results per page and max age. Confirm the order against the
// OlxCrawler constructor in use.
const crawlers = [
new OlxCrawler(
OLX_CONFIG.OLX_START_PAGE,
OLX_CONFIG.OLX_END_PAGE,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
[new PostgresSaver()],
OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
OLX_CONFIG.OLX_MAX_PAGES,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
OLX_CONFIG.OLX_MAX_AGE
)
];

View File

@@ -2,29 +2,29 @@
require("dotenv").config({ path: "../../.env" });
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
// Ad type to crawl: the OLX_CRAWLER_AD_TYPE env var is used as a key into
// CRAWLER_AD_TYPE; null when the variable is not set.
const crawlerAdType =
const olxCrawlerAdType =
process.env.OLX_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
: null;
// Category names taken from the comma-separated OLX_CRAWLER_AD_CATEGORIES env var, each
// entry trimmed; CATEGORY_FLAT and CATEGORY_HOUSE are used when the variable is not set.
const parsedCrawlerAdCategories =
const olxParsedCrawlerAdCategories =
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim()
)
: ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
// Translate category names into AD_CATEGORY values and drop every name whose lookup
// yields a falsy value (e.g. a name that is not a key of AD_CATEGORY).
const transformedCrawlerAdCategories = parsedCrawlerAdCategories
const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories
.map(categoryName => AD_CATEGORY[categoryName])
.filter(category => !!category);
// Effective OLX crawler settings, read from the environment.
// Every numeric setting is parsed as a base-10 integer and falls back to its default
// when the variable is unset, not a number, or 0 (the `||` fallback treats 0 as unset).
const OLX_CONFIG = {
  // Legacy page-range settings, kept so existing readers of this object keep working.
  // NOTE(review): the constructor arguments that use OLX_MAX_PAGES do not reference
  // these two keys — confirm they are unused before removing them.
  OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE, 10) || 1,
  OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE, 10) || 10,
  // Page limit. Reads OLX_MAX_PAGES, the name documented in the .env example; the
  // un-prefixed MAX_PAGES that was read before is still honoured as a fallback.
  OLX_MAX_PAGES:
    parseInt(process.env.OLX_MAX_PAGES, 10) ||
    parseInt(process.env.MAX_PAGES, 10) ||
    500,
  OLX_MAX_RESULTS_PER_PAGE:
    parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE, 10) || 50,
  // CRAWLER_AD_TYPE.NONE is used when the env var is unset or names an unknown type.
  OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
  OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories,
  // Age in days, forwarded by the crawler to its savers.
  OLX_MAX_AGE: parseInt(process.env.OLX_MAX_AGE, 10) || 30
};
module.exports = {

View File

@@ -7,9 +7,9 @@ class PostgresSaver {
return true;
}
/**
 * Saves one batch of crawled results by delegating to bulkUpsertRealEstates.
 * Logs "[POSTGRES] Saving..." before the call and waits for the upsert to finish.
 *
 * @param results batch of crawled results, passed through unchanged
 * @param maxAge passed through unchanged as the second argument of bulkUpsertRealEstates
 */
async save(results) {
async save(results, maxAge) {
console.log("[POSTGRES] Saving...");
await bulkUpsertRealEstates(results);
await bulkUpsertRealEstates(results, maxAge);
}
close() {

View File

@@ -1,7 +1,8 @@
"use strict";
let fetch = require("node-fetch");
let cheerio = require("cheerio");
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const {
AD_TYPE,
@@ -13,78 +14,113 @@ const {
} = require("../../common/enums");
// Query-string fragments for the OLX search URL, keyed by the shared crawler enums.
// The crawler appends these fragments to its base URL when building page URLs.
const OLX_ENUMS = {
OLX_AD_TYPE: {},
OLX_AD_CATEGORY: {},
// Ad-type filter ("vrsta" parameter). ALL maps to an empty fragment, so no filter
// parameter is added to the URL for it.
OLX_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje"
},
// Category filter ("kategorija" parameter).
// NOTE(review): the numbers are presumably OLX's own category ids — confirm.
OLX_AD_CATEGORY: {
[AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23",
[AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24",
[AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29",
[AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25",
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
},
MAX_DETAIL_FIELDS: 30
};
// The same ad-type and category mappings, assigned one property at a time.
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = "";
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja";
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27";
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30";
class OlxCrawler {
/**
 * Stores the crawl settings on the instance and sets the OLX search base URL
 * (results sorted by date, descending).
 *
 * NOTE(review): two parameter lists are visible here — one with fromPage/toPage/maxResults
 * and one with maxPages/maxResultsPerPage/maxAge. Confirm which one is in effect.
 *
 * @param fromPage first page of a fixed page range (default 1)
 * @param toPage last page of a fixed page range (default 10)
 * @param maxResults result limit stored as this.maxResults (default 1000)
 * @param savers objects whose save(results, maxAge) is awaited for every crawled page
 * @param crawlerAdTypes CRAWLER_AD_TYPE value used to pick the ad-type URL fragment
 * @param crawlerAdCategories AD_CATEGORY values to crawl (default: flats and houses)
 * @param maxPages page limit checked by the category page generator (default 1000)
 * @param maxResultsPerPage limit handed to indexSinglePage for each page (default 100)
 * @param maxAge value forwarded to the savers together with each page of results (default 30)
 */
constructor(
fromPage = 1,
toPage = 10,
maxResults = 1000,
savers = [],
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
crawlerAdCategories = [
AD_CATEGORY.CATEGORY_FLAT,
AD_CATEGORY.CATEGORY_HOUSE
]
],
maxPages = 1000,
maxResultsPerPage = 100,
maxAge = 30
) {
this.fromPage = fromPage;
this.toPage = toPage;
this.maxResults = maxResults;
this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages;
this.maxResultsPerPage = maxResultsPerPage;
this.maxAge = maxAge;
}
async crawl() {
console.log("[OLX] Crawler started");
const crawlAdTypes = this.crawlerAdTypes;
const crawlAdCategories = this.crawlerAdCategories;
const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`;
if (crawlAdCategories && crawlAdTypes) {
const asyncPagesIndexingByCategory = [];
if (crawlAdCategories) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
asyncPagesIndexingByCategory.push(
this.indexPages(
`${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}`
)
);
indexGenerators.push(this.categoryIndexer(adCategory));
}
await Promise.all(asyncPagesIndexingByCategory);
let done = false;
while (!done) {
const categoryIndexerPromises = [];
for (const indexGenerator of indexGenerators) {
categoryIndexerPromises.push(indexGenerator.next());
}
Promise.all(categoryIndexerPromises).then(singlePageResults => {
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
this.saveCrawledResults(singlePageResult, this.maxAge)
.then(numberOfSaved => {})
.catch(error =>
console.log("[POSTGRES Saver] Error saving results : ", error)
);
} else {
//Generator returned undefined, no more pages
indexGenerators.splice(index, 1);
if (indexGenerators.length === 0) {
done = true;
}
}
}
});
await this.sleep(500);
}
}
console.log("[OLX] Crawler finished");
}
async indexPages(url) {
const startPage = this.fromPage;
const endPage = this.toPage;
const maxResultsPerPage = this.maxResults;
async *categoryIndexer(adCategory) {
let pageToIndex = 1;
for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
const pageUrl = `${url}&stranica=${pageNumber}`;
const singlePageResults = await this.indexSinglePage(
pageUrl,
maxResultsPerPage
);
await this.saveCrawledResults(singlePageResults);
await this.sleep(5000);
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
if (urlAdTypePart && urlCategoryPart) {
while (true) {
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
const singlePageResults = await this.indexSinglePage(
urlPageToCrawl,
this.maxResultsPerPage
);
console.log("indexing ", adCategory, " page : ", pageToIndex);
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
yield singlePageResults;
} else {
return undefined;
}
++pageToIndex;
if (pageToIndex === this.maxPages) {
return undefined;
}
}
} else {
return undefined;
}
}
@@ -111,18 +147,16 @@ class OlxCrawler {
let actualNoOfResults =
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) {
const adData = await this.scrapeAd(hrefs[i]);
if (adData) {
singlePageResults.push(adData);
}
await this.sleep(500);
asyncScraping.push(this.scrapeAd(hrefs[i]));
}
return singlePageResults;
const scrapedData = await Promise.all(asyncScraping);
return scrapedData;
} catch (e) {
console.error("Exception caught:" + e);
return [];
}
}
@@ -135,24 +169,32 @@ class OlxCrawler {
const username = $(
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
).text();
)
.text()
.trim();
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
return null;
}
const title = $("#naslovartikla").text();
const title = $("#naslovartikla")
.text()
.trim();
const descriptions = $(".artikal_detaljniopis_tekst");
const category = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
).text();
)
.text()
.trim();
//====== PRICE DETECTION AND EXTRACTION =====
let price = null;
const normalPriceValue = $("#pc > p:nth-child(2)").text();
const urgentPriceValue = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
).text();
)
.text()
.trim();
if (normalPriceValue && normalPriceValue.length > 0) {
price = normalPriceValue;
@@ -258,7 +300,9 @@ class OlxCrawler {
const time = $("time").attr("datetime");
const numberOfViews = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
).text();
)
.text()
.trim();
//===========================================
//=========================================
@@ -296,8 +340,14 @@ class OlxCrawler {
price: parsedPrice,
area: parsedArea,
gardenSize: parsedGardenSize,
shortDescription: descriptions.first().text(),
longDescription: descriptions.last().text(),
shortDescription: descriptions
.first()
.text()
.trim(),
longDescription: descriptions
.last()
.text()
.trim(),
streetNumber: 0,
streetName: "",
locality: "",
@@ -370,11 +420,11 @@ class OlxCrawler {
return new Promise(resolve => setTimeout(resolve, ms));
}
/**
 * Hands a batch of results to every saver in this.savers, one saver at a time and in
 * list order; each save is awaited before the next saver is called.
 *
 * @param results batch of crawled results, passed to each saver unchanged
 * @param maxAge passed to each saver unchanged as the second argument of save
 */
async saveCrawledResults(results) {
async saveCrawledResults(results, maxAge) {
const savers = this.savers;
for (const saver of savers) {
await saver.save(results);
await saver.save(results, maxAge);
}
}
}

View File

@@ -1,7 +1,7 @@
"use strict";
const db = require("../../models/index");
const bulkUpsertRealEstates = async realEstateData => {
const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
try {
const fieldsToUpdateIfDuplicate = [
"realEstateType",

View File

@@ -16,8 +16,8 @@ SOURCE_EMAIL=info@saburly.com
#=============== CRAWLER SETTINGS===============#
#==OLX==
OLX_START_PAGE=Crawler starts from this page
OLX_END_PAGE=Crawler ends with this page (including this page)
OLX_MAX_PAGES=Maximum number of result pages the crawler will index per category
OLX_MAX_RESULTS_PER_PAGE=At most this many results from a single page will be scraped and saved
OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
OLX_MAX_AGE=[in days] if an ad was last crawled more than this many days ago, it will be re-crawled