From 82533bd3826c5ce22f1750fc2952d86c9d4ef2df Mon Sep 17 00:00:00 2001
From: ahmedsosic
Date: Tue, 24 May 2022 18:14:43 +0200
Subject: [PATCH] add atvcrawler and motornesankecrawler

---
 .../allcrawlers/atvvrawler/atvcrawler.py      | 149 ++++++++++++++++
 .../motornesankecrawler/mscrawler.py          | 165 ++++++++++++++++++
 2 files changed, 314 insertions(+)
 create mode 100644 kivi_cars/allcrawlers/atvvrawler/atvcrawler.py
 create mode 100644 kivi_cars/allcrawlers/motornesankecrawler/mscrawler.py

diff --git a/kivi_cars/allcrawlers/atvvrawler/atvcrawler.py b/kivi_cars/allcrawlers/atvvrawler/atvcrawler.py
new file mode 100644
index 0000000..2854390
--- /dev/null
+++ b/kivi_cars/allcrawlers/atvvrawler/atvcrawler.py
@@ -0,0 +1,149 @@
+"""Crawl ATV/UTV/Quad listings from olx.ba and save them to proba-atv.xlsx."""
+
+from datetime import date
+from bs4 import BeautifulSoup
+from urllib import response
+from urllib import request
+import requests
+import pandas as pd
+
+
+user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
+
+headers = {'User-Agent': user_agent}
+
+# Seconds to wait for a response before a request is abandoned
+request_timeout = 30
+
+# olx.ba category that is crawled (2457 is stored as 'ATV/UTV/Quad')
+category_of_vehicle = 2457
+
+# Number of listing pages to crawl, starting with page 1
+pages_number_to_crawl = 2
+
+# Listing address; the page number is appended for every request
+listing_website = 'https://www.olx.ba/pretraga?kategorija=2457&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica='
+
+# All filters; fields found on an article that are not listed here are
+# added to that article's row under their own name
+filters = {
+    "Olx_id": None,
+    "Kategorija": None,
+    "Cijena": None,
+    "Stanje": None,
+    "Lokacija_kanton": None,
+    "Lokacija_grad": None,
+    "Kilometraža": None,
+    "Kubikaža (ccm)": None,
+    "Godište": None,
+    "Vrsta mašine (broj taktova)": None,
+    "Transmisija": None,
+    "Način hlađenja": None,
+    "Datum": None,
+    "Vrijeme": None,
+}
+
+
+def get_soup(url):
+    """Download url and return its HTML parsed into a BeautifulSoup object."""
+    page = requests.get(url, headers=headers, timeout=request_timeout)
+    return BeautifulSoup(page.content, 'html.parser')
+
+
+def collect_olx_ids(soup_page):
+    """Return the article ids linked from one parsed listing page."""
+    ids = []
+    for list_item in soup_page.find_all('div', {'class': 'listitem'}):
+        link = list_item.find('a')
+        # Only entries that contain a 'p' tag and a link are articles
+        if link is None or not list_item.find('p'):
+            continue
+        # Divide id from rest of link: it is the fifth '/'-separated part
+        link_parts = link.get('href', '').split('/')
+        if len(link_parts) > 4:
+            ids.append(link_parts[4])
+    return ids
+
+
+def scrape_article(article_id):
+    """Return a copy of filters filled with the data of one article."""
+    podaci = filters.copy()
+    podaci['Olx_id'] = article_id
+
+    # Add the category name
+    if category_of_vehicle == 2457:
+        podaci['Kategorija'] = 'ATV/UTV/Quad'
+
+    soup_item = get_soup('https://www.olx.ba/artikal/' + article_id)
+    result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
+    if result_item is None:
+        # No article block on the page: keep the row with id and category
+        return podaci
+
+    # Price: first word of the second paragraph of the price block
+    price_div = result_item.find('div', {'id': 'pc'})
+    if price_div:
+        price_paragraphs = price_div.find_all('p')
+        if len(price_paragraphs) > 1:
+            price_words = price_paragraphs[1].get_text().split()
+            if price_words:
+                item_cijena = price_words[0]
+                # A price text starting with "Po" is stored as "Po dogovoru"
+                if item_cijena == 'Po':
+                    item_cijena = 'Po dogovoru'
+                podaci['Cijena'] = item_cijena
+
+    # Location: the first word is the canton, the remaining words the city
+    location_div = result_item.find('div', {'class': 'mobile-lokacija'})
+    if location_div:
+        location_words = location_div.get('data-content', '').split()
+        if location_words:
+            podaci['Lokacija_kanton'] = location_words[0].replace(',', '')
+            podaci['Lokacija_grad'] = ' '.join(location_words[1:])
+
+    # Condition: the second word of the element text
+    state_div = result_item.find('div', {'class': 'mobile-stanje'})
+    if state_div:
+        state_words = state_div.get_text().split()
+        if len(state_words) > 1:
+            podaci['Stanje'] = state_words[1]
+
+    # Additional fields: name in the 'df1' div, value in the 'df2' div
+    for extra_fields in result_item.find_all('div', {'id': 'dodatnapolja1'}):
+        for field in extra_fields.find_all('div', {'class': 'df'}):
+            name_div = field.find('div', {'class': 'df1'})
+            value_div = field.find('div', {'class': 'df2'})
+            if name_div is None or value_div is None:
+                continue
+            # A value that contains an 'i' tag is stored as True
+            if value_div.find('i'):
+                podaci[name_div.get_text()] = True
+            else:
+                podaci[name_div.get_text()] = value_div.get_text()
+
+    # Time and date: first and third word of the element text
+    time_tag = result_item.find('time', {'class': 'entry-date'})
+    if time_tag:
+        date_time_parts = time_tag.get_text().split()
+        if len(date_time_parts) > 2:
+            podaci['Datum'] = date_time_parts[0]
+            podaci['Vrijeme'] = date_time_parts[2]
+
+    return podaci
+
+
+# List of olx ids, gathered from every listing page
+olx_id = []
+for page_number in range(1, pages_number_to_crawl + 1):
+    main_website = listing_website + str(page_number)
+    olx_id.extend(collect_olx_ids(get_soup(main_website)))
+
+# One dictionary (spreadsheet row) per article
+podaci_db = [scrape_article(article_id) for article_id in olx_id]
+
+
+# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
+
+olx_db = pd.DataFrame(podaci_db)
+
+olx_db.to_excel('proba-atv.xlsx', index=False)
diff --git a/kivi_cars/allcrawlers/motornesankecrawler/mscrawler.py b/kivi_cars/allcrawlers/motornesankecrawler/mscrawler.py
new file mode 100644
index 0000000..c3dd5f5
--- /dev/null
+++ b/kivi_cars/allcrawlers/motornesankecrawler/mscrawler.py
@@ -0,0 +1,165 @@
+"""Crawl Motorne sanke listings from olx.ba into proba-motorne-sanke.xlsx."""
+
+from datetime import date
+from bs4 import BeautifulSoup
+from urllib import response
+from urllib import request
+import requests
+import pandas as pd
+
+
+user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
+
+headers = {'User-Agent': user_agent}
+
+# Seconds to wait for a response before a request is abandoned
+request_timeout = 30
+
+# olx.ba category that is crawled (2127 is stored as 'Motorne sanke')
+category_of_vehicle = 2127
+
+# Number of listing pages to crawl, starting with page 1
+pages_number_to_crawl = 2
+
+# Listing address; the page number is appended for every request
+listing_website = 'https://www.olx.ba/pretraga?kategorija=2127&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica='
+
+# All filters; fields found on an article that are not listed here are
+# added to that article's row under their own name
+filters = {
+    "Olx_id": None,
+    "Kategorija": None,
+    "Cijena": None,
+    "Stanje": None,
+    "Lokacija_kanton": None,
+    "Lokacija_grad": None,
+    "Model": None,
+    "Zapremina/Kubikaža (ccm)": None,
+    "Masa (kg)": None,
+    "Snaga (kW)": None,
+    "Konjskih snaga (KS)": None,
+    "Maksimalna brzina (km/h)": None,
+    "Visina (mm)": None,
+    "Dužina (mm)": None,
+    "Širina (mm)": None,
+    "Gusjenica (DxŠxV)": None,
+    "Raspon skija (mm)": None,
+    "Zapremina rezervoara (L)": None,
+    "Dužina gusjenice (inch/mm)": None,
+    "Godina proizvodnje": None,
+    "Mjenjač (Transmisija)": None,
+    "Hlađenje": None,
+    "Broj cilindara": None,
+    "Boja": None,
+    "Rikverc (vožnja unazad)": None,
+    "Grijači za noge": None,
+    "Grijači za ruke": None,
+    "Kuka za vuču": None,
+    "Datum": None,
+    "Vrijeme": None,
+}
+
+
+def get_soup(url):
+    """Download url and return its HTML parsed into a BeautifulSoup object."""
+    page = requests.get(url, headers=headers, timeout=request_timeout)
+    return BeautifulSoup(page.content, 'html.parser')
+
+
+def collect_olx_ids(soup_page):
+    """Return the article ids linked from one parsed listing page."""
+    ids = []
+    for list_item in soup_page.find_all('div', {'class': 'listitem'}):
+        link = list_item.find('a')
+        # Only entries that contain a 'p' tag and a link are articles
+        if link is None or not list_item.find('p'):
+            continue
+        # Divide id from rest of link: it is the fifth '/'-separated part
+        link_parts = link.get('href', '').split('/')
+        if len(link_parts) > 4:
+            ids.append(link_parts[4])
+    return ids
+
+
+def scrape_article(article_id):
+    """Return a copy of filters filled with the data of one article."""
+    podaci = filters.copy()
+    podaci['Olx_id'] = article_id
+
+    # Add the category name
+    if category_of_vehicle == 2127:
+        podaci['Kategorija'] = 'Motorne sanke'
+
+    soup_item = get_soup('https://www.olx.ba/artikal/' + article_id)
+    result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
+    if result_item is None:
+        # No article block on the page: keep the row with id and category
+        return podaci
+
+    # Price: first word of the second paragraph of the price block
+    price_div = result_item.find('div', {'id': 'pc'})
+    if price_div:
+        price_paragraphs = price_div.find_all('p')
+        if len(price_paragraphs) > 1:
+            price_words = price_paragraphs[1].get_text().split()
+            if price_words:
+                item_cijena = price_words[0]
+                # A price text starting with "Po" is stored as "Po dogovoru"
+                if item_cijena == 'Po':
+                    item_cijena = 'Po dogovoru'
+                podaci['Cijena'] = item_cijena
+
+    # Location: the first word is the canton, the remaining words the city
+    location_div = result_item.find('div', {'class': 'mobile-lokacija'})
+    if location_div:
+        location_words = location_div.get('data-content', '').split()
+        if location_words:
+            podaci['Lokacija_kanton'] = location_words[0].replace(',', '')
+            podaci['Lokacija_grad'] = ' '.join(location_words[1:])
+
+    # Condition: the second word of the element text
+    state_div = result_item.find('div', {'class': 'mobile-stanje'})
+    if state_div:
+        state_words = state_div.get_text().split()
+        if len(state_words) > 1:
+            podaci['Stanje'] = state_words[1]
+
+    # Additional fields: name in the 'df1' div, value in the 'df2' div
+    for extra_fields in result_item.find_all('div', {'id': 'dodatnapolja1'}):
+        for field in extra_fields.find_all('div', {'class': 'df'}):
+            name_div = field.find('div', {'class': 'df1'})
+            value_div = field.find('div', {'class': 'df2'})
+            if name_div is None or value_div is None:
+                continue
+            # A value that contains an 'i' tag is stored as True
+            if value_div.find('i'):
+                podaci[name_div.get_text()] = True
+            else:
+                podaci[name_div.get_text()] = value_div.get_text()
+
+    # Time and date: first and third word of the element text
+    time_tag = result_item.find('time', {'class': 'entry-date'})
+    if time_tag:
+        date_time_parts = time_tag.get_text().split()
+        if len(date_time_parts) > 2:
+            podaci['Datum'] = date_time_parts[0]
+            podaci['Vrijeme'] = date_time_parts[2]
+
+    return podaci
+
+
+# List of olx ids, gathered from every listing page
+olx_id = []
+for page_number in range(1, pages_number_to_crawl + 1):
+    main_website = listing_website + str(page_number)
+    olx_id.extend(collect_olx_ids(get_soup(main_website)))
+
+# One dictionary (spreadsheet row) per article
+podaci_db = [scrape_article(article_id) for article_id in olx_id]
+
+
+# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
+
+olx_db = pd.DataFrame(podaci_db)
+
+olx_db.to_excel('proba-motorne-sanke.xlsx', index=False)