Use a generator function to index pages; crawl in parallel
This commit is contained in:
@@ -13,12 +13,12 @@ const PostgresSaver = require("./savers/postgres");
|
||||
|
||||
const crawlers = [
|
||||
new OlxCrawler(
|
||||
OLX_CONFIG.OLX_START_PAGE,
|
||||
OLX_CONFIG.OLX_END_PAGE,
|
||||
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
||||
[new PostgresSaver()],
|
||||
OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
|
||||
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES
|
||||
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
|
||||
OLX_CONFIG.OLX_MAX_PAGES,
|
||||
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
||||
OLX_CONFIG.OLX_MAX_AGE
|
||||
)
|
||||
];
|
||||
|
||||
|
||||
@@ -2,29 +2,29 @@
|
||||
require("dotenv").config({ path: "../../.env" });
|
||||
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
|
||||
|
||||
const crawlerAdType =
|
||||
const olxCrawlerAdType =
|
||||
process.env.OLX_CRAWLER_AD_TYPE !== undefined
|
||||
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
|
||||
: null;
|
||||
|
||||
const parsedCrawlerAdCategories =
|
||||
const olxParsedCrawlerAdCategories =
|
||||
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
|
||||
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
||||
category.trim()
|
||||
)
|
||||
: ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
|
||||
|
||||
const transformedCrawlerAdCategories = parsedCrawlerAdCategories
|
||||
const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories
|
||||
.map(categoryName => AD_CATEGORY[categoryName])
|
||||
.filter(category => !!category);
|
||||
|
||||
const OLX_CONFIG = {
|
||||
OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE) || 1,
|
||||
OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE) || 10,
|
||||
OLX_MAX_PAGES: parseInt(process.env.MAX_PAGES) || 500,
|
||||
OLX_MAX_RESULTS_PER_PAGE:
|
||||
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
|
||||
OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories
|
||||
OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories,
|
||||
OLX_MAX_AGE: parseInt(process.env.OLX_MAX_AGE) || 30
|
||||
};
|
||||
|
||||
module.exports = {
|
||||
|
||||
@@ -7,9 +7,9 @@ class PostgresSaver {
|
||||
return true;
|
||||
}
|
||||
|
||||
async save(results) {
|
||||
async save(results, maxAge) {
|
||||
console.log("[POSTGRES] Saving...");
|
||||
await bulkUpsertRealEstates(results);
|
||||
await bulkUpsertRealEstates(results, maxAge);
|
||||
}
|
||||
|
||||
close() {
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
"use strict";
|
||||
|
||||
let fetch = require("node-fetch");
|
||||
let cheerio = require("cheerio");
|
||||
const fetch = require("node-fetch");
|
||||
const cheerio = require("cheerio");
|
||||
const Promise = require("bluebird");
|
||||
|
||||
const {
|
||||
AD_TYPE,
|
||||
@@ -13,78 +14,113 @@ const {
|
||||
} = require("../../common/enums");
|
||||
|
||||
const OLX_ENUMS = {
|
||||
OLX_AD_TYPE: {},
|
||||
OLX_AD_CATEGORY: {},
|
||||
OLX_AD_TYPE: {
|
||||
[CRAWLER_AD_TYPE.ALL]: "",
|
||||
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
|
||||
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje"
|
||||
},
|
||||
OLX_AD_CATEGORY: {
|
||||
[AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23",
|
||||
[AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24",
|
||||
[AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29",
|
||||
[AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25",
|
||||
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
|
||||
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
|
||||
},
|
||||
MAX_DETAIL_FIELDS: 30
|
||||
};
|
||||
|
||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = "";
|
||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja";
|
||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje";
|
||||
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30";
|
||||
|
||||
class OlxCrawler {
|
||||
constructor(
|
||||
fromPage = 1,
|
||||
toPage = 10,
|
||||
maxResults = 1000,
|
||||
savers = [],
|
||||
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
|
||||
crawlerAdCategories = [
|
||||
AD_CATEGORY.CATEGORY_FLAT,
|
||||
AD_CATEGORY.CATEGORY_HOUSE
|
||||
]
|
||||
],
|
||||
maxPages = 1000,
|
||||
maxResultsPerPage = 100,
|
||||
maxAge = 30
|
||||
) {
|
||||
this.fromPage = fromPage;
|
||||
this.toPage = toPage;
|
||||
this.maxResults = maxResults;
|
||||
this.savers = savers;
|
||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||
this.crawlerAdTypes = crawlerAdTypes;
|
||||
this.crawlerAdCategories = crawlerAdCategories;
|
||||
this.maxPages = maxPages;
|
||||
this.maxResultsPerPage = maxResultsPerPage;
|
||||
this.maxAge = maxAge;
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
console.log("[OLX] Crawler started");
|
||||
const crawlAdTypes = this.crawlerAdTypes;
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`;
|
||||
|
||||
if (crawlAdCategories && crawlAdTypes) {
|
||||
const asyncPagesIndexingByCategory = [];
|
||||
if (crawlAdCategories) {
|
||||
const indexGenerators = [];
|
||||
for (const adCategory of crawlAdCategories) {
|
||||
asyncPagesIndexingByCategory.push(
|
||||
this.indexPages(
|
||||
`${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}`
|
||||
)
|
||||
);
|
||||
indexGenerators.push(this.categoryIndexer(adCategory));
|
||||
}
|
||||
|
||||
await Promise.all(asyncPagesIndexingByCategory);
|
||||
let done = false;
|
||||
while (!done) {
|
||||
const categoryIndexerPromises = [];
|
||||
for (const indexGenerator of indexGenerators) {
|
||||
categoryIndexerPromises.push(indexGenerator.next());
|
||||
}
|
||||
|
||||
Promise.all(categoryIndexerPromises).then(singlePageResults => {
|
||||
const entries = singlePageResults.entries();
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (singlePageResult) {
|
||||
this.saveCrawledResults(singlePageResult, this.maxAge)
|
||||
.then(numberOfSaved => {})
|
||||
.catch(error =>
|
||||
console.log("[POSTGRES Saver] Error saving results : ", error)
|
||||
);
|
||||
} else {
|
||||
//Generator returned undefined, no more pages
|
||||
indexGenerators.splice(index, 1);
|
||||
if (indexGenerators.length === 0) {
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
await this.sleep(500);
|
||||
}
|
||||
}
|
||||
|
||||
console.log("[OLX] Crawler finished");
|
||||
}
|
||||
|
||||
async indexPages(url) {
|
||||
const startPage = this.fromPage;
|
||||
const endPage = this.toPage;
|
||||
const maxResultsPerPage = this.maxResults;
|
||||
async *categoryIndexer(adCategory) {
|
||||
let pageToIndex = 1;
|
||||
|
||||
for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
|
||||
const pageUrl = `${url}&stranica=${pageNumber}`;
|
||||
const singlePageResults = await this.indexSinglePage(
|
||||
pageUrl,
|
||||
maxResultsPerPage
|
||||
);
|
||||
await this.saveCrawledResults(singlePageResults);
|
||||
await this.sleep(5000);
|
||||
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
||||
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
||||
if (urlAdTypePart && urlCategoryPart) {
|
||||
while (true) {
|
||||
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
||||
const singlePageResults = await this.indexSinglePage(
|
||||
urlPageToCrawl,
|
||||
this.maxResultsPerPage
|
||||
);
|
||||
console.log("indexing ", adCategory, " page : ", pageToIndex);
|
||||
|
||||
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
||||
yield singlePageResults;
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
++pageToIndex;
|
||||
if (pageToIndex === this.maxPages) {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,18 +147,16 @@ class OlxCrawler {
|
||||
let actualNoOfResults =
|
||||
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
|
||||
|
||||
const asyncScraping = [];
|
||||
for (let i = 0; i < actualNoOfResults; i++) {
|
||||
const adData = await this.scrapeAd(hrefs[i]);
|
||||
|
||||
if (adData) {
|
||||
singlePageResults.push(adData);
|
||||
}
|
||||
await this.sleep(500);
|
||||
asyncScraping.push(this.scrapeAd(hrefs[i]));
|
||||
}
|
||||
|
||||
return singlePageResults;
|
||||
const scrapedData = await Promise.all(asyncScraping);
|
||||
return scrapedData;
|
||||
} catch (e) {
|
||||
console.error("Exception caught:" + e);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,24 +169,32 @@ class OlxCrawler {
|
||||
|
||||
const username = $(
|
||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
|
||||
).text();
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const title = $("#naslovartikla").text();
|
||||
const title = $("#naslovartikla")
|
||||
.text()
|
||||
.trim();
|
||||
const descriptions = $(".artikal_detaljniopis_tekst");
|
||||
const category = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
||||
).text();
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
//====== PRICE DETECTION AND EXTRACTION =====
|
||||
let price = null;
|
||||
const normalPriceValue = $("#pc > p:nth-child(2)").text();
|
||||
const urgentPriceValue = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
|
||||
).text();
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
if (normalPriceValue && normalPriceValue.length > 0) {
|
||||
price = normalPriceValue;
|
||||
@@ -258,7 +300,9 @@ class OlxCrawler {
|
||||
const time = $("time").attr("datetime");
|
||||
const numberOfViews = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
|
||||
).text();
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
//===========================================
|
||||
|
||||
//=========================================
|
||||
@@ -296,8 +340,14 @@ class OlxCrawler {
|
||||
price: parsedPrice,
|
||||
area: parsedArea,
|
||||
gardenSize: parsedGardenSize,
|
||||
shortDescription: descriptions.first().text(),
|
||||
longDescription: descriptions.last().text(),
|
||||
shortDescription: descriptions
|
||||
.first()
|
||||
.text()
|
||||
.trim(),
|
||||
longDescription: descriptions
|
||||
.last()
|
||||
.text()
|
||||
.trim(),
|
||||
streetNumber: 0,
|
||||
streetName: "",
|
||||
locality: "",
|
||||
@@ -370,11 +420,11 @@ class OlxCrawler {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
async saveCrawledResults(results) {
|
||||
async saveCrawledResults(results, maxAge) {
|
||||
const savers = this.savers;
|
||||
|
||||
for (const saver of savers) {
|
||||
await saver.save(results);
|
||||
await saver.save(results, maxAge);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"use strict";
|
||||
const db = require("../../models/index");
|
||||
|
||||
const bulkUpsertRealEstates = async realEstateData => {
|
||||
const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
|
||||
try {
|
||||
const fieldsToUpdateIfDuplicate = [
|
||||
"realEstateType",
|
||||
|
||||
@@ -16,8 +16,8 @@ SOURCE_EMAIL=info@saburly.com
|
||||
|
||||
#=============== CRAWLER SETTINGS===============#
|
||||
#==OLX==
|
||||
OLX_START_PAGE=Crawler starts from this page
|
||||
OLX_END_PAGE=Crawler ends with this page (including this page)
|
||||
OLX_MAX_PAGES=Restrict crawler to this number of pages
|
||||
OLX_MAX_RESULTS_PER_PAGE=At most this many results from one page will be scraped and saved
|
||||
OLX_CRAWLER_AD_TYPE=enum name of the type of ads to crawl; check the common/enums.js file for valid values
|
||||
OLX_CRAWLER_AD_CATEGORIES=comma-separated list of enum names of the categories to include; check the common/enums.js file for valid values
|
||||
OLX_MAX_AGE=[in days] if an ad was last crawled more than this number of days ago, it will be re-crawled
|
||||
|
||||
Reference in New Issue
Block a user