Improve olx scraper #113
@@ -88,14 +88,31 @@ class OlxCrawler {
|
||||
const entries = singlePageResults.entries();
|
||||
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("================================");
|
||||
console.log("Category Indexer index : ", index);
|
||||
console.log("\tTotal entries : ", singlePageResult.length)
|
||||
}
|
||||
if (singlePageResult) {
|
||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||
const { newRecords, existingRecords } = saveResults;
|
||||
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("--------------------------");
|
||||
console.log("\tNew record URLs [", newRecords.length, "] :");
|
||||
|
||||
for(const newRecord of newRecords) {
|
||||
console.log("\t\t",newRecord.url);
|
||||
}
|
||||
|
||||
console.log("\t-------------------------");
|
||||
console.log("\tExisting record URLs [", existingRecords.length, "] :");
|
||||
}
|
||||
|
||||
newRealEstates.push(...newRecords);
|
||||
|
||||
for (const existingRecord of existingRecords) {
|
||||
const { publishedDate, renewedDate } = existingRecord;
|
||||
const { publishedDate, renewedDate, url } = existingRecord;
|
||||
|
||||
const publishedDateMoment = moment.utc(publishedDate);
|
||||
const renewedDateMoment = moment.utc(renewedDate);
|
||||
@@ -105,13 +122,25 @@ class OlxCrawler {
|
||||
"minute"
|
||||
);
|
||||
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("\t\t", url);
|
||||
console.log("\t\t\tPublished date : ", publishedDate);
|
||||
console.log("\t\t\tRenewed date : ", renewedDate);
|
||||
console.log("\t\t\tIs same (up to minute) : ", stopCrawlingThisCategory);
|
||||
}
|
||||
|
||||
if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) {
|
||||
generatorsToRemove[index] = true;
|
||||
// console.log("\tGenerator ", index + 1, "has no more new ads");
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("\t\t\tStopping this category indexer");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("\tNo more entries in this category, stopping!");
|
||||
}
|
||||
//Generator returned undefined, remove this generator from array
|
||||
generatorsToRemove[index] = true;
|
||||
// console.log("Generator ", index + 1, "has no more pages");
|
||||
|
||||
Reference in New Issue
Block a user