Stop crawling when an existing, non-renewed ad is found
This commit is contained in:
@@ -18,18 +18,23 @@ const crawlers = [
|
||||
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
|
||||
OLX_CONFIG.OLX_MAX_PAGES,
|
||||
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
||||
OLX_CONFIG.OLX_IGNORED_USERNAMES
|
||||
OLX_CONFIG.OLX_IGNORED_USERNAMES,
|
||||
OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES
|
||||
)
|
||||
];
|
||||
|
||||
/**
 * Runs every configured crawler one after another and logs how many new
 * real estates each crawl reported.
 *
 * A failure in one crawler is logged and does not stop the remaining
 * crawlers from running.
 *
 * @returns {Promise<void>} Resolves once every crawler has been attempted.
 */
async function crawlAll() {
  for (const crawler of crawlers) {
    try {
      // Crawl exactly once per crawler: a second crawl() call would repeat
      // all of the work and make the count logged below describe the wrong run.
      const newRealEstates = await crawler.crawl();

      console.log("Number of new real estates : ", newRealEstates.length);
    } catch (e) {
      console.log("Error crawling. Trying next crawler! ", e);
    }
  }
}
|
||||
|
||||
// Script entry point: start a single crawl run. Invoking crawlAll() more than
// once here would launch overlapping crawls over the same crawler instances.
(async () => {
  await crawlAll();
})().catch(error => {
  // crawlAll() handles per-crawler failures itself; anything that reaches
  // this handler is unexpected, so report it and signal failure to the caller.
  console.error(error);
  process.exitCode = 1;
});
|
||||
|
||||
@@ -44,7 +44,8 @@ class OlxCrawler {
|
||||
],
|
||||
maxPages = 1000,
|
||||
maxResultsPerPage = 100,
|
||||
ignoredUsernames = []
|
||||
ignoredUsernames = [],
|
||||
delayBetweenPages = 1000
|
||||
) {
|
||||
this.savers = savers;
|
||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||
@@ -53,14 +54,14 @@ class OlxCrawler {
|
||||
this.maxPages = maxPages;
|
||||
this.maxResultsPerPage = maxResultsPerPage;
|
||||
this.ignoredUsernames = ignoredUsernames;
|
||||
this.delayBetweenPages = delayBetweenPages;
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
console.log("[OLX] Crawler started");
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
const savedRealEstates = [];
|
||||
const asyncSaveActions = [];
|
||||
const newRealEstates = [];
|
||||
|
||||
if (crawlAdCategories) {
|
||||
const indexGenerators = [];
|
||||
@@ -71,58 +72,62 @@ class OlxCrawler {
|
||||
let done = false;
|
||||
while (!done) {
|
||||
const categoryIndexerPromises = [];
|
||||
const generatorsToRemove = [];
|
||||
for (const indexGenerator of indexGenerators) {
|
||||
categoryIndexerPromises.push(indexGenerator.next());
|
||||
generatorsToRemove.push(false);
|
||||
}
|
||||
|
||||
Promise.all(categoryIndexerPromises).then(singlePageResults => {
|
||||
const entries = singlePageResults.entries();
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (singlePageResult) {
|
||||
const savePromise = this.saveCrawledResults(singlePageResult)
|
||||
.then(({ exist, savedRecords }) => {
|
||||
if (exist) {
|
||||
indexGenerators.splice(index, 1);
|
||||
if (indexGenerators.length === 0) {
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
const singlePageResults = await Promise.all(categoryIndexerPromises);
|
||||
const entries = singlePageResults.entries();
|
||||
|
||||
for (const savedRecord of savedRecords) {
|
||||
const { createdAt, updatedAt } = savedRecord;
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (singlePageResult) {
|
||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||
const { newRecords, existingRecords } = saveResults;
|
||||
|
||||
console.log("Comparing ", createdAt, " <> ", updatedAt);
|
||||
newRealEstates.push(...newRecords);
|
||||
|
||||
const createdAtMoment = moment.utc(createdAt);
|
||||
const updatedAtMoment = moment.utc(updatedAt);
|
||||
for (const existingRecord of existingRecords) {
|
||||
const { publishedDate, renewedDate } = existingRecord;
|
||||
|
||||
if (createdAtMoment.isSame(updatedAtMoment, "second")) {
|
||||
console.log("\tEqual !");
|
||||
savedRealEstates.push(savedRecord);
|
||||
}
|
||||
}
|
||||
})
|
||||
.catch(error =>
|
||||
console.log("[POSTGRES Saver] Error saving results : ", error)
|
||||
);
|
||||
asyncSaveActions.push(savePromise);
|
||||
} else {
|
||||
//Generator returned undefined, no more pages
|
||||
indexGenerators.splice(index, 1);
|
||||
if (indexGenerators.length === 0) {
|
||||
done = true;
|
||||
const publishedDateMoment = moment.utc(publishedDate);
|
||||
const renewedDateMoment = moment.utc(renewedDate);
|
||||
|
||||
const stopCrawlingThisCategory = publishedDateMoment.isSame(
|
||||
renewedDateMoment,
|
||||
"minute"
|
||||
);
|
||||
|
||||
if (stopCrawlingThisCategory) {
|
||||
generatorsToRemove[index] = true;
|
||||
// console.log("\tGenerator ", index + 1, "has no more new ads");
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//Generator returned undefined, remove this generator from array
|
||||
generatorsToRemove[index] = true;
|
||||
// console.log("Generator ", index + 1, "has no more pages");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
await this.sleep(5000);
|
||||
// console.log("Generators state : ", generatorsToRemove);
|
||||
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
|
||||
if (generatorsToRemove[i]) {
|
||||
// console.log("\tRemove generator ", i + 1);
|
||||
indexGenerators.splice(i, 1);
|
||||
}
|
||||
}
|
||||
if (indexGenerators.length === 0) {
|
||||
done = true;
|
||||
}
|
||||
|
||||
// await this.sleep(this.delayBetweenPages);
|
||||
}
|
||||
}
|
||||
console.log("[OLX] Waiting for async save actions ...");
|
||||
await Promise.all(asyncSaveActions);
|
||||
console.log("[OLX] Crawler finished");
|
||||
return savedRealEstates;
|
||||
return newRealEstates;
|
||||
}
|
||||
|
||||
async *categoryIndexer(adCategory) {
|
||||
@@ -191,7 +196,7 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async scrapeAd(url) {
|
||||
console.log("Scraping : ", url);
|
||||
//console.log("Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
|
||||
Reference in New Issue
Block a user