Stop crawling when an existing, non-renewed ad is found

This commit is contained in:
Bilal Catic
2019-09-25 08:54:33 +02:00
parent b3fcc6ba9a
commit c9a959f8be
2 changed files with 54 additions and 44 deletions

View File

@@ -18,18 +18,23 @@ const crawlers = [
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
OLX_CONFIG.OLX_MAX_PAGES,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
OLX_CONFIG.OLX_IGNORED_USERNAMES
OLX_CONFIG.OLX_IGNORED_USERNAMES,
OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES
)
];
/**
 * Runs every configured crawler one after another and logs how many new
 * real estates each crawl reported.
 *
 * A failure inside one crawler is logged and does not prevent the remaining
 * crawlers from running.
 *
 * @returns {Promise<void>} Settles once every crawler has been attempted.
 */
async function crawlAll() {
  for (const crawler of crawlers) {
    try {
      // Crawl exactly once per crawler: a second call would repeat the whole
      // crawl and throw away the first result.
      const newRealEstates = await crawler.crawl();
      console.log("Number of new real estates : ", newRealEstates.length);
    } catch (e) {
      console.log("Error crawling. Trying next crawler! ", e);
    }
  }
}
// Entry point: start the crawl exactly once. A second, un-awaited call to
// crawlAll() would run every crawler twice concurrently.
(async () => {
  await crawlAll();
})();

View File

@@ -44,7 +44,8 @@ class OlxCrawler {
],
maxPages = 1000,
maxResultsPerPage = 100,
ignoredUsernames = []
ignoredUsernames = [],
delayBetweenPages = 1000
) {
this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
@@ -53,14 +54,14 @@ class OlxCrawler {
this.maxPages = maxPages;
this.maxResultsPerPage = maxResultsPerPage;
this.ignoredUsernames = ignoredUsernames;
this.delayBetweenPages = delayBetweenPages;
}
async crawl() {
console.log("[OLX] Crawler started");
const crawlAdCategories = this.crawlerAdCategories;
const savedRealEstates = [];
const asyncSaveActions = [];
const newRealEstates = [];
if (crawlAdCategories) {
const indexGenerators = [];
@@ -71,58 +72,62 @@ class OlxCrawler {
let done = false;
while (!done) {
const categoryIndexerPromises = [];
const generatorsToRemove = [];
for (const indexGenerator of indexGenerators) {
categoryIndexerPromises.push(indexGenerator.next());
generatorsToRemove.push(false);
}
Promise.all(categoryIndexerPromises).then(singlePageResults => {
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
const savePromise = this.saveCrawledResults(singlePageResult)
.then(({ exist, savedRecords }) => {
if (exist) {
indexGenerators.splice(index, 1);
if (indexGenerators.length === 0) {
done = true;
}
}
const singlePageResults = await Promise.all(categoryIndexerPromises);
const entries = singlePageResults.entries();
for (const savedRecord of savedRecords) {
const { createdAt, updatedAt } = savedRecord;
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords, existingRecords } = saveResults;
console.log("Comparing ", createdAt, " <> ", updatedAt);
newRealEstates.push(...newRecords);
const createdAtMoment = moment.utc(createdAt);
const updatedAtMoment = moment.utc(updatedAt);
for (const existingRecord of existingRecords) {
const { publishedDate, renewedDate } = existingRecord;
if (createdAtMoment.isSame(updatedAtMoment, "second")) {
console.log("\tEqual !");
savedRealEstates.push(savedRecord);
}
}
})
.catch(error =>
console.log("[POSTGRES Saver] Error saving results : ", error)
);
asyncSaveActions.push(savePromise);
} else {
//Generator returned undefined, no more pages
indexGenerators.splice(index, 1);
if (indexGenerators.length === 0) {
done = true;
const publishedDateMoment = moment.utc(publishedDate);
const renewedDateMoment = moment.utc(renewedDate);
const stopCrawlingThisCategory = publishedDateMoment.isSame(
renewedDateMoment,
"minute"
);
if (stopCrawlingThisCategory) {
generatorsToRemove[index] = true;
// console.log("\tGenerator ", index + 1, "has no more new ads");
break;
}
}
} else {
//Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true;
// console.log("Generator ", index + 1, "has no more pages");
}
});
}
await this.sleep(5000);
// console.log("Generators state : ", generatorsToRemove);
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
if (generatorsToRemove[i]) {
// console.log("\tRemove generator ", i + 1);
indexGenerators.splice(i, 1);
}
}
if (indexGenerators.length === 0) {
done = true;
}
// await this.sleep(this.delayBetweenPages);
}
}
console.log("[OLX] Waiting for async save actions ...");
await Promise.all(asyncSaveActions);
console.log("[OLX] Crawler finished");
return savedRealEstates;
return newRealEstates;
}
async *categoryIndexer(adCategory) {
@@ -191,7 +196,7 @@ class OlxCrawler {
}
async scrapeAd(url) {
console.log("Scraping : ", url);
//console.log("Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();