Use a generator function to index pages; crawl in parallel
This commit is contained in:
@@ -13,12 +13,12 @@ const PostgresSaver = require("./savers/postgres");
|
|||||||
|
|
||||||
const crawlers = [
|
const crawlers = [
|
||||||
new OlxCrawler(
|
new OlxCrawler(
|
||||||
OLX_CONFIG.OLX_START_PAGE,
|
|
||||||
OLX_CONFIG.OLX_END_PAGE,
|
|
||||||
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
|
||||||
[new PostgresSaver()],
|
[new PostgresSaver()],
|
||||||
OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
|
OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
|
||||||
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES
|
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
|
||||||
|
OLX_CONFIG.OLX_MAX_PAGES,
|
||||||
|
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
||||||
|
OLX_CONFIG.OLX_MAX_AGE
|
||||||
)
|
)
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -2,29 +2,29 @@
|
|||||||
require("dotenv").config({ path: "../../.env" });
|
require("dotenv").config({ path: "../../.env" });
|
||||||
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
|
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
|
||||||
|
|
||||||
const crawlerAdType =
|
const olxCrawlerAdType =
|
||||||
process.env.OLX_CRAWLER_AD_TYPE !== undefined
|
process.env.OLX_CRAWLER_AD_TYPE !== undefined
|
||||||
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
|
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
const parsedCrawlerAdCategories =
|
const olxParsedCrawlerAdCategories =
|
||||||
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
|
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
|
||||||
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
||||||
category.trim()
|
category.trim()
|
||||||
)
|
)
|
||||||
: ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
|
: ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
|
||||||
|
|
||||||
const transformedCrawlerAdCategories = parsedCrawlerAdCategories
|
const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories
|
||||||
.map(categoryName => AD_CATEGORY[categoryName])
|
.map(categoryName => AD_CATEGORY[categoryName])
|
||||||
.filter(category => !!category);
|
.filter(category => !!category);
|
||||||
|
|
||||||
const OLX_CONFIG = {
|
const OLX_CONFIG = {
|
||||||
OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE) || 1,
|
OLX_MAX_PAGES: parseInt(process.env.MAX_PAGES) || 500,
|
||||||
OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE) || 10,
|
|
||||||
OLX_MAX_RESULTS_PER_PAGE:
|
OLX_MAX_RESULTS_PER_PAGE:
|
||||||
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
|
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
|
||||||
OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE,
|
OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||||
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories
|
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories,
|
||||||
|
OLX_MAX_AGE: parseInt(process.env.OLX_MAX_AGE) || 30
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|||||||
@@ -7,9 +7,9 @@ class PostgresSaver {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
async save(results) {
|
async save(results, maxAge) {
|
||||||
console.log("[POSTGRES] Saving...");
|
console.log("[POSTGRES] Saving...");
|
||||||
await bulkUpsertRealEstates(results);
|
await bulkUpsertRealEstates(results, maxAge);
|
||||||
}
|
}
|
||||||
|
|
||||||
close() {
|
close() {
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
let fetch = require("node-fetch");
|
const fetch = require("node-fetch");
|
||||||
let cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
|
const Promise = require("bluebird");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
AD_TYPE,
|
AD_TYPE,
|
||||||
@@ -13,78 +14,113 @@ const {
|
|||||||
} = require("../../common/enums");
|
} = require("../../common/enums");
|
||||||
|
|
||||||
const OLX_ENUMS = {
|
const OLX_ENUMS = {
|
||||||
OLX_AD_TYPE: {},
|
OLX_AD_TYPE: {
|
||||||
OLX_AD_CATEGORY: {},
|
[CRAWLER_AD_TYPE.ALL]: "",
|
||||||
|
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
|
||||||
|
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje"
|
||||||
|
},
|
||||||
|
OLX_AD_CATEGORY: {
|
||||||
|
[AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23",
|
||||||
|
[AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24",
|
||||||
|
[AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29",
|
||||||
|
[AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25",
|
||||||
|
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
|
||||||
|
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
|
||||||
|
},
|
||||||
MAX_DETAIL_FIELDS: 30
|
MAX_DETAIL_FIELDS: 30
|
||||||
};
|
};
|
||||||
|
|
||||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = "";
|
|
||||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja";
|
|
||||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje";
|
|
||||||
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30";
|
|
||||||
|
|
||||||
class OlxCrawler {
|
class OlxCrawler {
|
||||||
constructor(
|
constructor(
|
||||||
fromPage = 1,
|
|
||||||
toPage = 10,
|
|
||||||
maxResults = 1000,
|
|
||||||
savers = [],
|
savers = [],
|
||||||
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
|
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
|
||||||
crawlerAdCategories = [
|
crawlerAdCategories = [
|
||||||
AD_CATEGORY.CATEGORY_FLAT,
|
AD_CATEGORY.CATEGORY_FLAT,
|
||||||
AD_CATEGORY.CATEGORY_HOUSE
|
AD_CATEGORY.CATEGORY_HOUSE
|
||||||
]
|
],
|
||||||
|
maxPages = 1000,
|
||||||
|
maxResultsPerPage = 100,
|
||||||
|
maxAge = 30
|
||||||
) {
|
) {
|
||||||
this.fromPage = fromPage;
|
|
||||||
this.toPage = toPage;
|
|
||||||
this.maxResults = maxResults;
|
|
||||||
this.savers = savers;
|
this.savers = savers;
|
||||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||||
this.crawlerAdTypes = crawlerAdTypes;
|
this.crawlerAdTypes = crawlerAdTypes;
|
||||||
this.crawlerAdCategories = crawlerAdCategories;
|
this.crawlerAdCategories = crawlerAdCategories;
|
||||||
|
this.maxPages = maxPages;
|
||||||
|
this.maxResultsPerPage = maxResultsPerPage;
|
||||||
|
this.maxAge = maxAge;
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
console.log("[OLX] Crawler started");
|
console.log("[OLX] Crawler started");
|
||||||
const crawlAdTypes = this.crawlerAdTypes;
|
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
const crawlAdCategories = this.crawlerAdCategories;
|
||||||
|
|
||||||
const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`;
|
if (crawlAdCategories) {
|
||||||
|
const indexGenerators = [];
|
||||||
if (crawlAdCategories && crawlAdTypes) {
|
|
||||||
const asyncPagesIndexingByCategory = [];
|
|
||||||
for (const adCategory of crawlAdCategories) {
|
for (const adCategory of crawlAdCategories) {
|
||||||
asyncPagesIndexingByCategory.push(
|
indexGenerators.push(this.categoryIndexer(adCategory));
|
||||||
this.indexPages(
|
|
||||||
`${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}`
|
|
||||||
)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
await Promise.all(asyncPagesIndexingByCategory);
|
let done = false;
|
||||||
|
while (!done) {
|
||||||
|
const categoryIndexerPromises = [];
|
||||||
|
for (const indexGenerator of indexGenerators) {
|
||||||
|
categoryIndexerPromises.push(indexGenerator.next());
|
||||||
|
}
|
||||||
|
|
||||||
|
Promise.all(categoryIndexerPromises).then(singlePageResults => {
|
||||||
|
const entries = singlePageResults.entries();
|
||||||
|
for (const [index, { value: singlePageResult }] of entries) {
|
||||||
|
if (singlePageResult) {
|
||||||
|
this.saveCrawledResults(singlePageResult, this.maxAge)
|
||||||
|
.then(numberOfSaved => {})
|
||||||
|
.catch(error =>
|
||||||
|
console.log("[POSTGRES Saver] Error saving results : ", error)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
//Generator returned undefined, no more pages
|
||||||
|
indexGenerators.splice(index, 1);
|
||||||
|
if (indexGenerators.length === 0) {
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
await this.sleep(500);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log("[OLX] Crawler finished");
|
console.log("[OLX] Crawler finished");
|
||||||
}
|
}
|
||||||
|
|
||||||
async indexPages(url) {
|
async *categoryIndexer(adCategory) {
|
||||||
const startPage = this.fromPage;
|
let pageToIndex = 1;
|
||||||
const endPage = this.toPage;
|
|
||||||
const maxResultsPerPage = this.maxResults;
|
|
||||||
|
|
||||||
for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
|
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
||||||
const pageUrl = `${url}&stranica=${pageNumber}`;
|
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
||||||
const singlePageResults = await this.indexSinglePage(
|
if (urlAdTypePart && urlCategoryPart) {
|
||||||
pageUrl,
|
while (true) {
|
||||||
maxResultsPerPage
|
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
||||||
);
|
const singlePageResults = await this.indexSinglePage(
|
||||||
await this.saveCrawledResults(singlePageResults);
|
urlPageToCrawl,
|
||||||
await this.sleep(5000);
|
this.maxResultsPerPage
|
||||||
|
);
|
||||||
|
console.log("indexing ", adCategory, " page : ", pageToIndex);
|
||||||
|
|
||||||
|
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
||||||
|
yield singlePageResults;
|
||||||
|
} else {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
|
||||||
|
++pageToIndex;
|
||||||
|
if (pageToIndex === this.maxPages) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return undefined;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -111,18 +147,16 @@ class OlxCrawler {
|
|||||||
let actualNoOfResults =
|
let actualNoOfResults =
|
||||||
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
|
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
|
||||||
|
|
||||||
|
const asyncScraping = [];
|
||||||
for (let i = 0; i < actualNoOfResults; i++) {
|
for (let i = 0; i < actualNoOfResults; i++) {
|
||||||
const adData = await this.scrapeAd(hrefs[i]);
|
asyncScraping.push(this.scrapeAd(hrefs[i]));
|
||||||
|
|
||||||
if (adData) {
|
|
||||||
singlePageResults.push(adData);
|
|
||||||
}
|
|
||||||
await this.sleep(500);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return singlePageResults;
|
const scrapedData = await Promise.all(asyncScraping);
|
||||||
|
return scrapedData;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("Exception caught:" + e);
|
console.error("Exception caught:" + e);
|
||||||
|
return [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -135,24 +169,32 @@ class OlxCrawler {
|
|||||||
|
|
||||||
const username = $(
|
const username = $(
|
||||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
|
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
|
||||||
).text();
|
)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
|
||||||
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
|
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const title = $("#naslovartikla").text();
|
const title = $("#naslovartikla")
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
const descriptions = $(".artikal_detaljniopis_tekst");
|
const descriptions = $(".artikal_detaljniopis_tekst");
|
||||||
const category = $(
|
const category = $(
|
||||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
||||||
).text();
|
)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
|
||||||
//====== PRICE DETECTION AND EXTRACTION =====
|
//====== PRICE DETECTION AND EXTRACTION =====
|
||||||
let price = null;
|
let price = null;
|
||||||
const normalPriceValue = $("#pc > p:nth-child(2)").text();
|
const normalPriceValue = $("#pc > p:nth-child(2)").text();
|
||||||
const urgentPriceValue = $(
|
const urgentPriceValue = $(
|
||||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
|
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
|
||||||
).text();
|
)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
|
||||||
if (normalPriceValue && normalPriceValue.length > 0) {
|
if (normalPriceValue && normalPriceValue.length > 0) {
|
||||||
price = normalPriceValue;
|
price = normalPriceValue;
|
||||||
@@ -258,7 +300,9 @@ class OlxCrawler {
|
|||||||
const time = $("time").attr("datetime");
|
const time = $("time").attr("datetime");
|
||||||
const numberOfViews = $(
|
const numberOfViews = $(
|
||||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
|
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
|
||||||
).text();
|
)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
//===========================================
|
//===========================================
|
||||||
|
|
||||||
//=========================================
|
//=========================================
|
||||||
@@ -296,8 +340,14 @@ class OlxCrawler {
|
|||||||
price: parsedPrice,
|
price: parsedPrice,
|
||||||
area: parsedArea,
|
area: parsedArea,
|
||||||
gardenSize: parsedGardenSize,
|
gardenSize: parsedGardenSize,
|
||||||
shortDescription: descriptions.first().text(),
|
shortDescription: descriptions
|
||||||
longDescription: descriptions.last().text(),
|
.first()
|
||||||
|
.text()
|
||||||
|
.trim(),
|
||||||
|
longDescription: descriptions
|
||||||
|
.last()
|
||||||
|
.text()
|
||||||
|
.trim(),
|
||||||
streetNumber: 0,
|
streetNumber: 0,
|
||||||
streetName: "",
|
streetName: "",
|
||||||
locality: "",
|
locality: "",
|
||||||
@@ -370,11 +420,11 @@ class OlxCrawler {
|
|||||||
return new Promise(resolve => setTimeout(resolve, ms));
|
return new Promise(resolve => setTimeout(resolve, ms));
|
||||||
}
|
}
|
||||||
|
|
||||||
async saveCrawledResults(results) {
|
async saveCrawledResults(results, maxAge) {
|
||||||
const savers = this.savers;
|
const savers = this.savers;
|
||||||
|
|
||||||
for (const saver of savers) {
|
for (const saver of savers) {
|
||||||
await saver.save(results);
|
await saver.save(results, maxAge);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
const db = require("../../models/index");
|
const db = require("../../models/index");
|
||||||
|
|
||||||
const bulkUpsertRealEstates = async realEstateData => {
|
const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
|
||||||
try {
|
try {
|
||||||
const fieldsToUpdateIfDuplicate = [
|
const fieldsToUpdateIfDuplicate = [
|
||||||
"realEstateType",
|
"realEstateType",
|
||||||
|
|||||||
@@ -16,8 +16,8 @@ SOURCE_EMAIL=info@saburly.com
|
|||||||
|
|
||||||
#=============== CRAWLER SETTINGS===============#
|
#=============== CRAWLER SETTINGS===============#
|
||||||
#==OLX==
|
#==OLX==
|
||||||
OLX_START_PAGE=Crawler starts from this page
|
OLX_MAX_PAGES=Restrict crawler to this number of pages
|
||||||
OLX_END_PAGE=Crawler ends with this page (including this page)
|
|
||||||
OLX_MAX_RESULTS_PER_PAGE=At most this number of results from one page will be scraped and saved
|
OLX_MAX_RESULTS_PER_PAGE=At most this number of results from one page will be scraped and saved
|
||||||
OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
||||||
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||||
|
OLX_MAX_AGE=[in days] if an ad was last crawled more than this number of days ago, it will be re-crawled
|
||||||
|
|||||||
Reference in New Issue
Block a user