Log in to prostor.ba before crawling.
This commit is contained in:
@@ -62,15 +62,16 @@ class ProstorCrawler {
|
||||
|
||||
async crawl() {
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
//We need session cookie to use login privileges
|
||||
const prostorCookie = await this.getCookies();
|
||||
//New flag to check if the crawler is logged in
|
||||
const login = await this.loginForScraping(PROSTOR_LOGIN);
|
||||
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
|
||||
const newRealEstates = [];
|
||||
//
|
||||
console.log("login before crawl:", login);
|
||||
//Crawl only if login was successful
|
||||
if (crawlAdCategories && login) {
|
||||
const indexGenerators = [];
|
||||
for (const adCategory of crawlAdCategories) {
|
||||
indexGenerators.push(this.categoryIndexer(adCategory));
|
||||
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
|
||||
}
|
||||
|
||||
let done = false;
|
||||
@@ -123,13 +124,14 @@ class ProstorCrawler {
|
||||
return newRealEstates;
|
||||
}
|
||||
|
||||
async *categoryIndexer(adCategory) {
|
||||
async *categoryIndexer(adCategory, prostorCookie) {
|
||||
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
||||
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
|
||||
const listOfAllRealEstates = await this.extractRealEstates(
|
||||
urlPageToCrawl
|
||||
urlPageToCrawl,
|
||||
prostorCookie
|
||||
);
|
||||
|
||||
let elementToStartIndexFrom = 0;
|
||||
@@ -143,7 +145,8 @@ class ProstorCrawler {
|
||||
elementToStartIndexFrom += realEstatesForSinglePage.length;
|
||||
|
||||
const singlePageResults = await this.indexSinglePage(
|
||||
realEstatesForSinglePage
|
||||
realEstatesForSinglePage,
|
||||
prostorCookie
|
||||
);
|
||||
|
||||
const filteredSinglePageResults = singlePageResults.filter(
|
||||
@@ -167,10 +170,10 @@ class ProstorCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
async indexSinglePage(realEstatesList) {
|
||||
async indexSinglePage(realEstatesList, prostorCookie) {
|
||||
const asyncActions = [];
|
||||
for (const realEstate of realEstatesList) {
|
||||
asyncActions.push(this.scrapeAd(realEstate));
|
||||
asyncActions.push(this.scrapeAd(realEstate, prostorCookie));
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -184,7 +187,7 @@ class ProstorCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
async scrapeAd(realEstate) {
|
||||
async scrapeAd(realEstate, prostorCookie) {
|
||||
const { lat, lng, property_name, price, size, link, status } = realEstate;
|
||||
|
||||
//Status information is given already in realestate list
|
||||
@@ -200,7 +203,9 @@ class ProstorCrawler {
|
||||
|
||||
// console.log("[PROSTOR] Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const adPageSource = await fetch(url, {
|
||||
headers: { Cookie: prostorCookie }
|
||||
});
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
@@ -422,13 +427,15 @@ class ProstorCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
async extractRealEstates(url) {
|
||||
async extractRealEstates(url, prostorCookie) {
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("[PROSTOR] Index page : ", url);
|
||||
}
|
||||
|
||||
try {
|
||||
const res = await fetch(url);
|
||||
const res = await fetch(url, {
|
||||
headers: { Cookie: prostorCookie }
|
||||
});
|
||||
const body = await res.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
@@ -585,28 +592,21 @@ class ProstorCrawler {
|
||||
return savers[0].save(results);
|
||||
//so that we can use some sequelize options and information when data is inserted
|
||||
}
|
||||
async loginForScraping(PROSTOR_LOGIN) {
|
||||
console.log("PROSTOR_LOGIN", PROSTOR_LOGIN);
|
||||
const prostorCookie = await this.getCookies();
|
||||
console.log("prostor cookie", prostorCookie);
|
||||
async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
|
||||
let formData = new FormData();
|
||||
formData.append("email", PROSTOR_LOGIN.EMAIL);
|
||||
formData.append("password", PROSTOR_LOGIN.PASSWORD);
|
||||
//Once logged in, the session stays logged in with the same credentials.
|
||||
//Do we need to log out ??
|
||||
|
||||
return fetch("https://prostor.ba/moj-prostor/prijava", {
|
||||
method: "POST",
|
||||
body: formData,
|
||||
headers: { Cookie: prostorCookie }
|
||||
})
|
||||
.then(page => {
|
||||
//
|
||||
console.log("headers: ", page.headers.raw()["set-cookie"]);
|
||||
return page.text();
|
||||
})
|
||||
.then(resp => {
|
||||
const $ = cheerio.load(resp);
|
||||
console.log("$ ", $("h1").text());
|
||||
if (
|
||||
$("h1")
|
||||
.text()
|
||||
|
||||
Reference in New Issue
Block a user