diff --git a/kivi_cars/allcrawlers/autocrawler/crawleritem.py b/kivi_cars/allcrawlers/autocrawler/crawleritem.py
new file mode 100644
index 0000000..a5e2694
--- /dev/null
+++ b/kivi_cars/allcrawlers/autocrawler/crawleritem.py
@@ -0,0 +1,170 @@
+# ----------Imports------------
+from datetime import date
+from traceback import print_tb
+from unittest import result
+from urllib import response
+from urllib.request import Request
+from warnings import filters
+from xml.etree.ElementTree import tostring
+from bs4 import BeautifulSoup
+from matplotlib import dates
+from numpy import diag_indices
+import requests
+import pandas as pd
+import random
+
+# Pool of User-Agent strings; one is picked at random for the whole run.
+user_agent_list = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
+    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
+    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
+]
+
+user_agent = random.choice(user_agent_list)
+headers = {'User-Agent': user_agent}
+
+# Seconds to wait for olx.ba before giving up on a request.
+REQUEST_TIMEOUT = 30
+
+# One dict per parsed listing.
+podaci_db = []
+
+# Column template for one listing, hoisted out of the page loop because it
+# never changes. This rebinds the name 'filters' imported from warnings above.
+filters = dict.fromkeys([
+    "Olx_id", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton", "Lokacija_grad",
+    "Proizvođač", "Model", "Godište", "Kilometraža", "Kilovata (KW)", "Kubikaža",
+    "Gorivo", "Broj vrata", "Konjskih snaga", "Metalik", "Masa/Težina (kg)", "Tip",
+    "Pogon", "Emisioni standard", "Veličina felgi", "Transmisija",
+    "Broj stepeni prijenosa", "Boja", "Muzika / ozvučenje", "Parking senzori",
+    "Parking kamera", "Registrovan do", "Godina prve registracije",
+    "Broj prethodnih vlasnika", "Posjeduje gume", "Višezonska klima", "Rolo zavjese",
+    "Svjetla", "Zaštita/Blokada", "Sjedećih mjesta", "Turbo", "DPF/FAP filter",
+    "Strane tablice", "Ocarinjen", "Prilagođen invalidima", "Servo volan", "Tempomat",
+    "ESP", "El. podizači stakala", "Senzor mrtvog ugla", "Digitalna klima",
+    "Touch screen (ekran)", "Panorama krov", "Koža", "Masaža sjedišta",
+    "El. pomjeranje sjedišta", "Senzor auto. svjetla", "Alarm",
+    "Daljinsko otključavanje", "Auto kuka", "Udaren", "Start-Stop sistem",
+    "Park assist", "Registrovan", "Na lizingu", "Servisna knjiga", "Komande na volanu",
+    "ABS", "Airbag", "Električni retrovizori", "Klima", "Navigacija", "Šiber",
+    "Naslon za ruku", "Hlađenje sjedišta", "Grijanje sjedišta", "Memorija sjedišta",
+    "Alu felge", "Centralna brava", "Oldtimer", "ISOFIX", "Datum", "Vrijeme",
+])
+
+# Keys filled, in order, from the listing's 'ispod' summary boxes.
+ispod_keys = ('Proizvođač', 'Model', 'Godište', 'Kilometraža', 'Gorivo')
+
+# Scratch script: every page parses this one fixed listing instead of the
+# ids collected from the result page.
+probe_artikal_link = 'https://www.olx.ba/artikal/' + '36976713'
+
+# Pagination across result pages: pages 1 .. pages_number_to_crawl - 1.
+pages_number_to_crawl = 2
+for page in range(1, pages_number_to_crawl):
+    # kategorija=18 - Automobili
+    main_website = 'https://www.olx.ba/pretraga?id=18&kategorija=18&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&kilometra-a_min=0&kilometra-a_max=0&stranica=' + str(page)
+    category_of_vehicle = 18
+
+    # Fetch and parse the result page.
+    response_for_page = requests.get(main_website, headers=headers, timeout=REQUEST_TIMEOUT)
+    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
+    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})
+
+    # Listing ids found on this page (5th '/'-separated part of each link).
+    olx_id = []
+    for listitem in results_all_items_per_page:
+        link = listitem.find('a')
+        if not listitem.find('p') or link is None:
+            continue
+        href_parts = link.get('href', '').split('/')
+        if len(href_parts) > 4:
+            olx_id.append(href_parts[4])
+
+    # NOTE(review): the original indentation was lost, so running the probe
+    # once per result page is an assumption - confirm with the author.
+    podaci = filters.copy()
+    podaci["Olx_id"] = probe_artikal_link
+    if category_of_vehicle == 18:
+        podaci['Kategorija'] = 'Automobili'
+
+    response_item = requests.get(probe_artikal_link, headers=headers, timeout=REQUEST_TIMEOUT)
+    soup_item = BeautifulSoup(response_item.content, 'html.parser')
+    result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
+    print(probe_artikal_link)
+    if result_item is None:
+        # No listing body on the page; skip it instead of crashing below.
+        continue
+
+    # ---- Basic fields ----
+
+    # Price: first word of the second 'p' element in #pc; 'Po' starts "Po dogovoru".
+    cijena_div = result_item.find('div', {'id': 'pc'})
+    if cijena_div:
+        x_cijena = cijena_div.find_all('p')
+        cijena_words = x_cijena[1].get_text().split() if len(x_cijena) > 1 else []
+        if cijena_words:
+            podaci['Cijena'] = "Po dogovoru" if cijena_words[0] == 'Po' else cijena_words[0]
+            print(podaci['Cijena'])
+
+    # Location: first word (comma stripped) is the canton, the rest the city.
+    lokacija_div = result_item.find('div', {'class': 'mobile-lokacija'})
+    if lokacija_div:
+        x_lokacija = lokacija_div.get('data-content', '').split()
+        if x_lokacija:
+            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
+            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])
+            print(podaci['Lokacija_kanton'])
+            print(podaci['Lokacija_grad'])
+
+    # Condition: second word of the 'mobile-stanje' text.
+    stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
+    if stanje_div:
+        x_stanje = stanje_div.get_text().split()
+        if len(x_stanje) > 1:
+            podaci['Stanje'] = x_stanje[1]
+            print(podaci['Stanje'])
+
+    # Summary boxes: the value is the second 'p' element of each 'ispod' div.
+    for key, ispod_div in zip(ispod_keys, result_item.find_all('div', {'class': 'ispod'})):
+        ispod_p = ispod_div.find_all('p')
+        if len(ispod_p) > 1:
+            podaci[key] = ispod_p[1].get_text()
+            print(podaci[key])
+
+    # ---- Advanced fields ----
+
+    # Additional fields: df1 holds the label, df2 the value; a df2 that
+    # contains an 'i' element is stored as True.
+    for polja_div in result_item.find_all('div', {'id': 'dodatnapolja1'}):
+        # attrs is a dict so the match is on the class attribute explicitly.
+        for df_div in polja_div.find_all('div', {'class': 'df'}):
+            df1 = df_div.find('div', {'class': 'df1'})
+            df2 = df_div.find('div', {'class': 'df2'})
+            if df1 is None or df2 is None:
+                continue
+            podaci[df1.get_text()] = True if df2.find('i') else df2.get_text()
+
+    # Keep only the first word of the KW value. The field stays None when
+    # the listing has no KW row, so guard before calling split().
+    kw = podaci['Kilovata (KW)']
+    if isinstance(kw, str) and kw.split():
+        podaci['Kilovata (KW)'] = kw.split()[0]
+
+    # Date and time: first and third word of the 'entry-date' text.
+    time_tag = result_item.find('time', {'class': 'entry-date'})
+    if time_tag:
+        date_time_div = time_tag.get_text().split()
+        if len(date_time_div) > 2:
+            podaci["Datum"] = date_time_div[0]
+            podaci["Vrijeme"] = date_time_div[2]
+
+    podaci_db.append(podaci)
+
+
+# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
+
+# Export is switched off in this scratch script:
+# olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
+# olx_db.to_excel('proba2.xlsx', index=False)
diff --git a/kivi_cars/allcrawlers/autocrawler/olxautocrawler.py b/kivi_cars/allcrawlers/autocrawler/olxautocrawler.py
new file mode 100644
index 0000000..1583929
--- /dev/null
+++ b/kivi_cars/allcrawlers/autocrawler/olxautocrawler.py
@@ -0,0 +1,161 @@
+# ----------Imports------------
+from datetime import date
+from traceback import print_tb
+from unittest import result
+from urllib import response
+from urllib.request import Request
+from bs4 import BeautifulSoup
+from matplotlib import dates
+from numpy import diag_indices
+import requests
+import pandas as pd
+import random
+
+# Pool of User-Agent strings; one is picked at random for the whole run.
+user_agent_list = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
+    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
+    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
+]
+
+user_agent = random.choice(user_agent_list)
+headers = {'User-Agent': user_agent}
+
+# Seconds to wait for olx.ba before giving up on a request.
+REQUEST_TIMEOUT = 30
+
+# One dict per crawled listing; these become the rows of the exported sheet.
+podaci_db = []
+
+# Column template for one listing, hoisted out of the page loop because it
+# never changes. Labels read from a page that are missing here become new keys.
+filters = dict.fromkeys([
+    "Olx_id", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton", "Lokacija_grad",
+    "Proizvođač", "Model", "Godište", "Kilometraža", "Kilovata (KW)", "Kubikaža",
+    "Gorivo", "Broj vrata", "Konjskih snaga", "Metalik", "Masa/Težina (kg)", "Tip",
+    "Pogon", "Emisioni standard", "Veličina felgi", "Transmisija",
+    "Broj stepeni prijenosa", "Boja", "Muzika / ozvučenje", "Parking senzori",
+    "Parking kamera", "Registrovan do", "Godina prve registracije",
+    "Broj prethodnih vlasnika", "Posjeduje gume", "Višezonska klima", "Rolo zavjese",
+    "Svjetla", "Zaštita/Blokada", "Sjedećih mjesta", "Turbo", "DPF/FAP filter",
+    "Strane tablice", "Ocarinjen", "Prilagođen invalidima", "Servo volan", "Tempomat",
+    "ESP", "El. podizači stakala", "Senzor mrtvog ugla", "Digitalna klima",
+    "Touch screen (ekran)", "Panorama krov", "Koža", "Masaža sjedišta",
+    "El. pomjeranje sjedišta", "Senzor auto. svjetla", "Alarm",
+    "Daljinsko otključavanje", "Auto kuka", "Udaren", "Start-Stop sistem",
+    "Park assist", "Registrovan", "Na lizingu", "Servisna knjiga", "Komande na volanu",
+    "ABS", "Airbag", "Električni retrovizori", "Klima", "Navigacija", "Šiber",
+    "Naslon za ruku", "Hlađenje sjedišta", "Grijanje sjedišta", "Memorija sjedišta",
+    "Alu felge", "Centralna brava", "Oldtimer", "ISOFIX", "Datum", "Vrijeme",
+])
+
+# Keys filled, in order, from the listing's 'ispod' summary boxes.
+ispod_keys = ('Proizvođač', 'Model', 'Godište', 'Kilometraža', 'Gorivo')
+
+# Pagination across result pages: pages 1 .. pages_number_to_crawl - 1.
+pages_number_to_crawl = 2
+for page in range(1, pages_number_to_crawl):
+    # kategorija=18 - Automobili
+    main_website = 'https://www.olx.ba/pretraga?id=18&kategorija=18&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&kilometra-a_min=0&kilometra-a_max=0&stranica=' + str(page)
+    category_of_vehicle = 18
+
+    # Fetch and parse the result page.
+    response_for_page = requests.get(main_website, headers=headers, timeout=REQUEST_TIMEOUT)
+    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
+    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})
+
+    # Listing ids found on this page (5th '/'-separated part of each link).
+    olx_id = []
+    for listitem in results_all_items_per_page:
+        link = listitem.find('a')
+        if not listitem.find('p') or link is None:
+            continue
+        href_parts = link.get('href', '').split('/')
+        if len(href_parts) > 4:
+            olx_id.append(href_parts[4])
+
+    for artikal_number in olx_id:
+        # Fresh dict for every listing.
+        podaci = filters.copy()
+        podaci["Olx_id"] = artikal_number
+        if category_of_vehicle == 18:
+            podaci['Kategorija'] = 'Automobili'
+
+        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
+        response_item = requests.get(artikal_link, headers=headers, timeout=REQUEST_TIMEOUT)
+        soup_item = BeautifulSoup(response_item.content, 'html.parser')
+        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
+        if result_item is None:
+            # No listing body on the page; skip it instead of crashing below.
+            continue
+
+        # ---- Basic fields ----
+
+        # Price: first word of the second 'p' element in #pc; 'Po' starts "Po dogovoru".
+        cijena_div = result_item.find('div', {'id': 'pc'})
+        if cijena_div:
+            x_cijena = cijena_div.find_all('p')
+            cijena_words = x_cijena[1].get_text().split() if len(x_cijena) > 1 else []
+            if cijena_words:
+                podaci['Cijena'] = "Po dogovoru" if cijena_words[0] == 'Po' else cijena_words[0]
+
+        # Location: first word (comma stripped) is the canton, the rest the city.
+        lokacija_div = result_item.find('div', {'class': 'mobile-lokacija'})
+        if lokacija_div:
+            x_lokacija = lokacija_div.get('data-content', '').split()
+            if x_lokacija:
+                podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
+                podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])
+
+        # Condition: second word of the 'mobile-stanje' text.
+        stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
+        if stanje_div:
+            x_stanje = stanje_div.get_text().split()
+            if len(x_stanje) > 1:
+                podaci['Stanje'] = x_stanje[1]
+
+        # Summary boxes: the value is the second 'p' element of each 'ispod' div.
+        for key, ispod_div in zip(ispod_keys, result_item.find_all('div', {'class': 'ispod'})):
+            ispod_p = ispod_div.find_all('p')
+            if len(ispod_p) > 1:
+                podaci[key] = ispod_p[1].get_text()
+
+        # ---- Advanced fields ----
+
+        # Additional fields: df1 holds the label, df2 the value; a df2 that
+        # contains an 'i' element is stored as True.
+        for polja_div in result_item.find_all('div', {'id': 'dodatnapolja1'}):
+            # attrs is a dict so the match is on the class attribute explicitly.
+            for df_div in polja_div.find_all('div', {'class': 'df'}):
+                df1 = df_div.find('div', {'class': 'df1'})
+                df2 = df_div.find('div', {'class': 'df2'})
+                if df1 is None or df2 is None:
+                    continue
+                podaci[df1.get_text()] = True if df2.find('i') else df2.get_text()
+
+        # Keep only the first word of the KW value. The field stays None when
+        # the listing has no KW row, so guard before calling split().
+        kw = podaci['Kilovata (KW)']
+        if isinstance(kw, str) and kw.split():
+            podaci['Kilovata (KW)'] = kw.split()[0]
+
+        # Date and time: first and third word of the 'entry-date' text.
+        time_tag = result_item.find('time', {'class': 'entry-date'})
+        if time_tag:
+            date_time_div = time_tag.get_text().split()
+            if len(date_time_div) > 2:
+                podaci["Datum"] = date_time_div[0]
+                podaci["Vrijeme"] = date_time_div[2]
+
+        podaci_db.append(podaci)
+
+
+# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
+
+olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
+
+olx_db.to_excel('proba2.xlsx', index=False)
+
+print("Zavrseno!!")
diff --git a/kivi_cars/allcrawlers/autocrawler/test_auto.py b/kivi_cars/allcrawlers/autocrawler/test_auto.py
new file mode 100644
index 0000000..bec1b61
--- /dev/null
+++ b/kivi_cars/allcrawlers/autocrawler/test_auto.py
@@ -0,0 +1,177 @@
+# ----------Imports------------
+from datetime import date
+from traceback import print_tb
+from unittest import result
+from urllib import response
+from urllib.request import Request
+from bs4 import BeautifulSoup
+from matplotlib import dates
+from numpy import diag_indices
+import requests
+import pandas as pd
+import random
+
+# Pool of User-Agent strings; one is picked at random for the whole run.
+user_agent_list = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
+    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
+    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
+]
+
+user_agent = random.choice(user_agent_list)
+headers = {'User-Agent': user_agent}
+
+# Search type (vrsta=*):
+#   All listings: https://www.olx.ba/pretraga?id=18&kategorija=18&sort_order=desc&sort_po=datum
+#   Sale only: https://www.olx.ba/pretraga?id=18&kategorija=18&sort_order=desc&sort_po=datum&vrsta=samoprodaja
+#   Wanted only: https://www.olx.ba/pretraga?id=18&kategorija=18&sort_order=desc&sort_po=datum&vrsta=samopotraznja
+#
+# With exchange (sazamjenom=sazamjenom):
+#   https://www.olx.ba/pretraga?id=18&kategorija=18&stanje=0&sort_order=desc&sort_po=datum&sazamjenom=sazamjenom
+
+# Seconds to wait for olx.ba before giving up on a request.
+REQUEST_TIMEOUT = 30
+
+# One dict per crawled listing.
+podaci_db = []
+
+# Column template for one listing, hoisted out of the page loop because it
+# never changes. Labels read from a page that are missing here become new keys.
+filters = dict.fromkeys([
+    "Olx_id", "Vrsta_oglasa", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton", "Lokacija_grad",
+    "Proizvođač", "Model", "Godište", "Kilometraža", "Kilovata (KW)", "Kubikaža",
+    "Gorivo", "Broj vrata", "Konjskih snaga", "Metalik", "Masa/Težina (kg)", "Tip",
+    "Pogon", "Emisioni standard", "Veličina felgi", "Transmisija",
+    "Broj stepeni prijenosa", "Boja", "Muzika / ozvučenje", "Parking senzori",
+    "Parking kamera", "Registrovan do", "Godina prve registracije",
+    "Broj prethodnih vlasnika", "Posjeduje gume", "Višezonska klima", "Rolo zavjese",
+    "Svjetla", "Zaštita/Blokada", "Sjedećih mjesta", "Turbo", "DPF/FAP filter",
+    "Strane tablice", "Ocarinjen", "Prilagođen invalidima", "Servo volan", "Tempomat",
+    "ESP", "El. podizači stakala", "Senzor mrtvog ugla", "Digitalna klima",
+    "Touch screen (ekran)", "Panorama krov", "Koža", "Masaža sjedišta",
+    "El. pomjeranje sjedišta", "Senzor auto. svjetla", "Alarm",
+    "Daljinsko otključavanje", "Auto kuka", "Udaren", "Start-Stop sistem",
+    "Park assist", "Registrovan", "Na lizingu", "Servisna knjiga", "Komande na volanu",
+    "ABS", "Airbag", "Električni retrovizori", "Klima", "Navigacija", "Šiber",
+    "Naslon za ruku", "Hlađenje sjedišta", "Grijanje sjedišta", "Memorija sjedišta",
+    "Alu felge", "Centralna brava", "Oldtimer", "ISOFIX", "Datum", "Vrijeme",
+])
+
+# Keys filled, in order, from the listing's 'ispod' summary boxes.
+ispod_keys = ('Proizvođač', 'Model', 'Godište', 'Kilometraža', 'Gorivo')
+
+# Label stored under 'Kategorija' for each category id used by these scripts.
+category_names = {18: 'Automobili', 884: 'Prikolice'}
+
+# Pagination across result pages: pages 1 .. pages_number_to_crawl - 1.
+pages_number_to_crawl = 2
+for page in range(1, pages_number_to_crawl):
+    # kategorija=18 - Automobili
+    main_website = 'https://www.olx.ba/pretraga?id=18&kategorija=18&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&kilometra-a_min=0&kilometra-a_max=0&stranica=' + str(page)
+    category_of_vehicle = 18
+
+    # Fetch and parse the result page.
+    response_for_page = requests.get(main_website, headers=headers, timeout=REQUEST_TIMEOUT)
+    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
+    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})
+
+    # Listing ids found on this page (5th '/'-separated part of each link).
+    olx_id = []
+    for listitem in results_all_items_per_page:
+        link = listitem.find('a')
+        if not listitem.find('p') or link is None:
+            continue
+        href_parts = link.get('href', '').split('/')
+        if len(href_parts) > 4:
+            olx_id.append(href_parts[4])
+
+    for artikal_number in olx_id:
+        # Fresh dict for every listing.
+        podaci = filters.copy()
+        podaci["Olx_id"] = artikal_number
+        podaci['Kategorija'] = category_names.get(category_of_vehicle)
+
+        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
+        response_item = requests.get(artikal_link, headers=headers, timeout=REQUEST_TIMEOUT)
+        soup_item = BeautifulSoup(response_item.content, 'html.parser')
+        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
+        if result_item is None:
+            # No listing body on the page; skip it instead of crashing below.
+            continue
+
+        # ---- Basic fields ----
+
+        # Price: first word of the second 'p' element in #pc; 'Po' starts "Po dogovoru".
+        cijena_div = result_item.find('div', {'id': 'pc'})
+        if cijena_div:
+            x_cijena = cijena_div.find_all('p')
+            cijena_words = x_cijena[1].get_text().split() if len(x_cijena) > 1 else []
+            if cijena_words:
+                podaci['Cijena'] = "Po dogovoru" if cijena_words[0] == 'Po' else cijena_words[0]
+                print(podaci['Cijena'])
+
+        # Location: first word (comma stripped) is the canton, the rest the city.
+        lokacija_div = result_item.find('div', {'class': 'mobile-lokacija'})
+        if lokacija_div:
+            x_lokacija = lokacija_div.get('data-content', '').split()
+            if x_lokacija:
+                podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
+                podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])
+                print(podaci['Lokacija_kanton'])
+                print(podaci['Lokacija_grad'])
+
+        # Condition: second word of the 'mobile-stanje' text.
+        stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
+        if stanje_div:
+            x_stanje = stanje_div.get_text().split()
+            if len(x_stanje) > 1:
+                podaci['Stanje'] = x_stanje[1]
+                print(podaci['Stanje'])
+
+        # Summary boxes: the value is the second 'p' element of each 'ispod' div.
+        for key, ispod_div in zip(ispod_keys, result_item.find_all('div', {'class': 'ispod'})):
+            ispod_p = ispod_div.find_all('p')
+            if len(ispod_p) > 1:
+                podaci[key] = ispod_p[1].get_text()
+                print(podaci[key])
+
+        # ---- Advanced fields ----
+
+        # Additional fields: df1 holds the label, df2 the value; a df2 that
+        # contains an 'i' element is stored as True.
+        for polja_div in result_item.find_all('div', {'id': 'dodatnapolja1'}):
+            # attrs is a dict so the match is on the class attribute explicitly.
+            for df_div in polja_div.find_all('div', {'class': 'df'}):
+                df1 = df_div.find('div', {'class': 'df1'})
+                df2 = df_div.find('div', {'class': 'df2'})
+                if df1 is None or df2 is None:
+                    continue
+                podaci[df1.get_text()] = True if df2.find('i') else df2.get_text()
+
+        # Keep only the first word of the KW value. The field stays None when
+        # the listing has no KW row, so guard before calling split().
+        kw = podaci['Kilovata (KW)']
+        if isinstance(kw, str) and kw.split():
+            podaci['Kilovata (KW)'] = kw.split()[0]
+            print(podaci['Kilovata (KW)'])
+
+        # Date and time: first and third word of the 'entry-date' text.
+        time_tag = result_item.find('time', {'class': 'entry-date'})
+        if time_tag:
+            date_time_div = time_tag.get_text().split()
+            if len(date_time_div) > 2:
+                podaci["Datum"] = date_time_div[0]
+                podaci["Vrijeme"] = date_time_div[2]
+
+        podaci_db.append(podaci)
+
+        print('--------------------------------------------------------------------')
+
+
+# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
+
+# Export is switched off in this test script:
+# olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
+# olx_db.to_excel('probatest1.xlsx', index=False)
diff --git a/kivi_cars/allcrawlers/biciklocrawler/olxbiciklocrawler.py b/kivi_cars/allcrawlers/biciklocrawler/olxbiciklocrawler.py
new file mode 100644
index 0000000..d0944f5
--- /dev/null
+++ b/kivi_cars/allcrawlers/biciklocrawler/olxbiciklocrawler.py
@@ -0,0 +1,122 @@
+from datetime import date
+from bs4 import BeautifulSoup
+from urllib import response
+from urllib import request
+import requests
+import pandas as pd
+
+
+user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
+headers = {'User-Agent': user_agent}
+
+# Seconds to wait for olx.ba before giving up on a request.
+REQUEST_TIMEOUT = 30
+
+# One dict per crawled listing; these become the rows of the exported sheet.
+podaci_db = []
+
+# Column template for one listing. Labels read from a page that are missing
+# here become new keys.
+filters = dict.fromkeys([
+    "Olx_id", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton", "Lokacija_grad",
+    "Brend", "Broj brzina", "Tip", "Spol", "Masa", "Veličina rama",
+    "Veličina točkova (inch)", "Godište", "Model", "Dječije", "Zadnji amortizer",
+    "Disk kočnice", "Amortizer na sicu", "Svjetla/Signalizacija", "Gepek",
+    "Prednji amortizer", "Nosač za vodu", "Datum", "Vrijeme",
+])
+
+# Listing ids gathered from every crawled result page.
+olx_id = []
+
+# Pagination across result pages: pages 1 .. pages_number_to_crawl - 1.
+pages_number_to_crawl = 2
+for page in range(1, pages_number_to_crawl):
+    # kategorija=22 - Bicikli
+    main_website = 'https://www.olx.ba/pretraga?kategorija=22&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(page)
+    category_of_vehicle = 22
+
+    # Fetch and parse the result page. This has to happen inside the loop,
+    # otherwise only the last page would ever be requested.
+    response_for_page = requests.get(main_website, headers=headers, timeout=REQUEST_TIMEOUT)
+    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
+    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})
+
+    # The listing id is the 5th '/'-separated part of each link.
+    for listitem in results_all_items_per_page:
+        link = listitem.find('a')
+        if not listitem.find('p') or link is None:
+            continue
+        href_parts = link.get('href', '').split('/')
+        if len(href_parts) > 4:
+            olx_id.append(href_parts[4])
+
+for artikal_number in olx_id:
+    podaci = filters.copy()
+    podaci["Olx_id"] = artikal_number
+    if category_of_vehicle == 22:
+        podaci['Kategorija'] = 'Bicikli'
+
+    artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
+    response_item = requests.get(artikal_link, headers=headers, timeout=REQUEST_TIMEOUT)
+    soup_item = BeautifulSoup(response_item.content, 'html.parser')
+    result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
+    if result_item is None:
+        # No listing body on the page; skip it instead of crashing below.
+        continue
+
+    # Price: first word of the second 'p' element in #pc; 'Po' starts "Po dogovoru".
+    cijena_div = result_item.find('div', {'id': 'pc'})
+    if cijena_div:
+        x_cijena = cijena_div.find_all('p')
+        cijena_words = x_cijena[1].get_text().split() if len(x_cijena) > 1 else []
+        if cijena_words:
+            podaci['Cijena'] = "Po dogovoru" if cijena_words[0] == 'Po' else cijena_words[0]
+
+    # Location: first word (comma stripped) is the canton, the rest the city.
+    lokacija_div = result_item.find('div', {'class': 'mobile-lokacija'})
+    if lokacija_div:
+        x_lokacija = lokacija_div.get('data-content', '').split()
+        if x_lokacija:
+            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
+            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])
+
+    # Brand: text of the link inside the itemprop="brand" div.
+    brend_div = result_item.find('div', {'itemprop': 'brand'})
+    brend_link = brend_div.find('a') if brend_div else None
+    if brend_link:
+        podaci['Brend'] = brend_link.get_text()
+
+    # Condition: second word of the 'mobile-stanje' text.
+    stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
+    if stanje_div:
+        x_stanje = stanje_div.get_text().split()
+        if len(x_stanje) > 1:
+            podaci['Stanje'] = x_stanje[1]
+
+    # Additional fields: df1 holds the label, df2 the value; a df2 that
+    # contains an 'i' element is stored as True.
+    for polja_div in result_item.find_all('div', {'id': 'dodatnapolja1'}):
+        # attrs is a dict so the match is on the class attribute explicitly.
+        for df_div in polja_div.find_all('div', {'class': 'df'}):
+            df1 = df_div.find('div', {'class': 'df1'})
+            df2 = df_div.find('div', {'class': 'df2'})
+            if df1 is None or df2 is None:
+                continue
+            podaci[df1.get_text()] = True if df2.find('i') else df2.get_text()
+
+    # Date and time: first and third word of the 'entry-date' text.
+    time_tag = result_item.find('time', {'class': 'entry-date'})
+    if time_tag:
+        date_time_div = time_tag.get_text().split()
+        if len(date_time_div) > 2:
+            podaci["Datum"] = date_time_div[0]
+            podaci["Vrijeme"] = date_time_div[2]
+
+    podaci_db.append(podaci)
+
+
+# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
+
+olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
+
+olx_db.to_excel('proba-bicikli.xlsx', index=False)
diff --git a/kivi_cars/allcrawlers/busevicrawler/busitem.py b/kivi_cars/allcrawlers/busevicrawler/busitem.py
new file mode 100644
index 0000000..0549860
--- /dev/null
+++ b/kivi_cars/allcrawlers/busevicrawler/busitem.py
@@ -0,0 +1,132 @@
+from datetime import date
+from bs4 import BeautifulSoup
+from urllib import response
+from urllib import request
+import requests
+import pandas as pd
+
+
+user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
+headers = {'User-Agent': user_agent}
+
+# Seconds to wait for olx.ba before giving up on a request.
+REQUEST_TIMEOUT = 30
+
+# One dict per parsed listing.
+podaci_db = []
+
+# Column template for one listing. Labels read from a page that are missing
+# here become new keys.
+filters = dict.fromkeys([
+    "Olx_id", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton", "Lokacija_grad",
+    "Brend", "Kilometraža", "Gorivo", "Tip", "Godište", "Kilovata (KW)",
+    "Konjskih snaga", "Kubikaža", "Boja", "Muzika / ozvučenje", "Transmisija",
+    "Model", "El. podizači stakala", "Električni retrovizori", "Klima", "Metalik",
+    "Navigacija", "Ocarinjen", "Registrovan", "Servisna knjiga", "Udaren",
+    "Xenon svjetla", "Datum", "Vrijeme",
+])
+
+# Scratch script: only this one fixed listing is parsed; the ids collected
+# from the result pages are not crawled here.
+probe_artikal_number = '46785631'
+
+# Listing ids gathered from every crawled result page.
+olx_id = []
+
+# Pagination across result pages: pages 1 .. pages_number_to_crawl - 1.
+pages_number_to_crawl = 4
+for page in range(1, pages_number_to_crawl):
+    # kategorija=1040 - Autobusi i minibusi
+    main_website = 'https://www.olx.ba/pretraga?kategorija=1040&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(page)
+    category_of_vehicle = 1040
+
+    # Fetch and parse the result page. This has to happen inside the loop,
+    # otherwise only the last page would ever be requested.
+    response_for_page = requests.get(main_website, headers=headers, timeout=REQUEST_TIMEOUT)
+    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
+    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})
+
+    # The listing id is the 5th '/'-separated part of each link.
+    for listitem in results_all_items_per_page:
+        link = listitem.find('a')
+        if not listitem.find('p') or link is None:
+            continue
+        href_parts = link.get('href', '').split('/')
+        if len(href_parts) > 4:
+            olx_id.append(href_parts[4])
+
+podaci = filters.copy()
+podaci["Olx_id"] = probe_artikal_number
+if category_of_vehicle == 1040:
+    podaci['Kategorija'] = 'Autobusi i minibusi'
+
+artikal_link = 'https://www.olx.ba/artikal/' + probe_artikal_number
+response_item = requests.get(artikal_link, headers=headers, timeout=REQUEST_TIMEOUT)
+soup_item = BeautifulSoup(response_item.content, 'html.parser')
+result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
+print(artikal_link)
+
+# Parse only when the page has a listing body (find() may return None).
+if result_item is not None:
+    # Price: first word of the second 'p' element in #pc; 'Po' starts "Po dogovoru".
+    cijena_div = result_item.find('div', {'id': 'pc'})
+    if cijena_div:
+        x_cijena = cijena_div.find_all('p')
+        cijena_words = x_cijena[1].get_text().split() if len(x_cijena) > 1 else []
+        if cijena_words:
+            podaci['Cijena'] = "Po dogovoru" if cijena_words[0] == 'Po' else cijena_words[0]
+            print(podaci['Cijena'])
+
+    # Location: first word (comma stripped) is the canton, the rest the city.
+    lokacija_div = result_item.find('div', {'class': 'mobile-lokacija'})
+    if lokacija_div:
+        x_lokacija = lokacija_div.get('data-content', '').split()
+        if x_lokacija:
+            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
+            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])
+            print(podaci['Lokacija_kanton'])
+            print(podaci['Lokacija_grad'])
+
+    # Brand: text of the link inside the itemprop="brand" div.
+    brend_div = result_item.find('div', {'itemprop': 'brand'})
+    brend_link = brend_div.find('a') if brend_div else None
+    if brend_link:
+        podaci['Brend'] = brend_link.get_text()
+
+    # Condition: second word of the 'mobile-stanje' text.
+    stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
+    if stanje_div:
+        x_stanje = stanje_div.get_text().split()
+        if len(x_stanje) > 1:
+            podaci['Stanje'] = x_stanje[1]
+            print(podaci['Stanje'])
+
+    # Additional fields: df1 holds the label, df2 the value; a df2 that
+    # contains an 'i' element is stored as True.
+    for polja_div in result_item.find_all('div', {'id': 'dodatnapolja1'}):
+        # attrs is a dict so the match is on the class attribute explicitly.
+        for df_div in polja_div.find_all('div', {'class': 'df'}):
+            df1 = df_div.find('div', {'class': 'df1'})
+            df2 = df_div.find('div', {'class': 'df2'})
+            if df1 is None or df2 is None:
+                continue
+            df_value = True if df2.find('i') else df2.get_text()
+            podaci[df1.get_text()] = df_value
+            print(df1.get_text() + ' : ' + str(df_value))
+
+    # Date and time: first and third word of the 'entry-date' text.
+    time_tag = result_item.find('time', {'class': 'entry-date'})
+    if time_tag:
+        date_time_div = time_tag.get_text().split()
+        if len(date_time_div) > 2:
+            podaci["Datum"] = date_time_div[0]
+            podaci["Vrijeme"] = date_time_div[2]
+
+    podaci_db.append(podaci)
+
+
+# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
+
+# Export is switched off in this scratch script:
+# olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
+# olx_db.to_excel('test-autobusi.xlsx', index=False)
diff --git a/kivi_cars/allcrawlers/busevicrawler/olxbusevicrawler.py b/kivi_cars/allcrawlers/busevicrawler/olxbusevicrawler.py
new file mode 100644
index 0000000..8fcff1b
--- /dev/null
+++ b/kivi_cars/allcrawlers/busevicrawler/olxbusevicrawler.py
@@ -0,0 +1,178 @@
+from datetime import date
+from bs4 
import BeautifulSoup +from urllib import response +from urllib import request +import requests +import pandas as pd + + +user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36' + +headers = {'User-Agent': user_agent} + + + +# Array of object filteri +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=1040&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(i) + category_of_vehicle = 1040 + +# Request to website +response_for_page = requests.get(main_website, headers=headers) + +# Soup object +soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + + + # Results +results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + +# List of olx id +olx_id = [] + +# All filters + +filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + "Lokacija_grad" : None, + "Brend" : None, + "Kilometraža" : None, + "Gorivo" : None, + "Tip" : None, + "Godište" : None, + "Kilovata (KW)" : None, + "Konjskih snaga" : None, + "Kubikaža" : None, + "Boja" : None, + "Muzika / ozvučenje" : None, + "Transmisija" : None, + "Model" : None, + "El. 
podizači stakala" : None, + "Električni retrovizori" : None, + "Klima" : None, + "Metalik" : None, + "Navigacija" : None, + "Ocarinjen" : None, + "Registrovan" : None, + "Servisna knjiga" : None, + "Udaren" : None, + "Xenon svjetla" : None, + "Datum" : None, + "Vrijeme" : None + } + + +# Number of all items +broj_el = 0 + +# Getting all id's of articles + +for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + + +# Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + +for i in range(0, broj_el): + podaci = filters.copy() + + +# Add kategorija + if (category_of_vehicle == 1040): podaci['Kategorija'] = ('Autobusi i minibusi') + +# Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + + # Getting filters info from item + + # Osnovni filteri + +# Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena + + + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + podaci['Lokacija_grad'] = mojstring + + # Brand + + if (result_item.find('div',{'itemprop':'brand'})): + x_brend = 
result_item.find('div',{'itemprop':'brand'}).find('a').get_text() + podaci['Brend'] = x_brend + +# Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test-autobusi.xlsx',index=False) diff --git a/kivi_cars/allcrawlers/busevicrawler/test_bus.py b/kivi_cars/allcrawlers/busevicrawler/test_bus.py new file mode 100644 index 0000000..57cd3e1 --- /dev/null +++ b/kivi_cars/allcrawlers/busevicrawler/test_bus.py @@ -0,0 +1,185 @@ +from datetime import date +from bs4 import BeautifulSoup +from urllib import response +from urllib import request +import requests +import pandas as pd + + +user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36' + +headers = {'User-Agent': user_agent} + + 
+ +# Array of object filteri +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=1040&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(i) + category_of_vehicle = 1040 + +# Request to website +response_for_page = requests.get(main_website, headers=headers) + +# Soup object +soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + + + # Results +results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + +# List of olx id +olx_id = [] + +# All filters + +filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + "Lokacija_grad" : None, + "Brend" : None, + "Kilometraža" : None, + "Gorivo" : None, + "Tip" : None, + "Godište" : None, + "Kilovata (KW)" : None, + "Konjskih snaga" : None, + "Kubikaža" : None, + "Boja" : None, + "Muzika / ozvučenje" : None, + "Transmisija" : None, + "Model" : None, + "El. 
podizači stakala" : None, + "Električni retrovizori" : None, + "Klima" : None, + "Metalik" : None, + "Navigacija" : None, + "Ocarinjen" : None, + "Registrovan" : None, + "Servisna knjiga" : None, + "Udaren" : None, + "Xenon svjetla" : None, + "Datum" : None, + "Vrijeme" : None + } + + +# Number of all items +broj_el = 0 + +# Getting all id's of articles + +for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + + +# Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + +for i in range(0, broj_el): + podaci = filters.copy() + + +# Add kategorija + if (category_of_vehicle == 1040): podaci['Kategorija'] = ('Autobusi i minibusi') + +# Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + print(artikal_link) + + # Getting filters info from item + + # Osnovni filteri + +# Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena + print(podaci['Cijena']) + + + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + print(podaci['Lokacija_kanton']) + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + podaci['Lokacija_grad'] = mojstring + print(podaci['Lokacija_grad']) + + # Brand + + if 
(result_item.find('div',{'itemprop':'brand'})): + x_brend = result_item.find('div',{'itemprop':'brand'}).find('a').get_text() + podaci['Brend'] = x_brend + +# Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + print(podaci['Stanje']) + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + #print(df_pom1 + ' : ' + str(df_pom2)) + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + # print(podaci["Datum"], podaci["Vrijeme"]) + + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test-autobusi.xlsx',index=False) diff --git a/kivi_cars/allcrawlers/kampercrawler/olxkampericrawler.py b/kivi_cars/allcrawlers/kampercrawler/olxkampericrawler.py new file mode 100644 index 0000000..4c790b6 --- /dev/null +++ b/kivi_cars/allcrawlers/kampercrawler/olxkampericrawler.py @@ -0,0 +1,221 @@ +# ----------Imports------------ +from datetime import date +from traceback import print_tb +from unittest import result +from 
urllib import response +from urllib.request import Request +from bs4 import BeautifulSoup +from matplotlib import dates +from numpy import diag_indices +import requests +import pandas as pd +import random + +# List of User-Agent +user_agent_list = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1', + 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363', +] + +user_agent = user_agent_list[random.randint(0, len(user_agent_list)-1)] +headers = {'User-Agent': user_agent} + +# Array of object filteri + +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=883&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(i) + category_of_vehicle = 883 + + # Request to website + response_for_page = requests.get(main_website, headers=headers) + + # Soup object + soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + # Results + results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + + # List of olx id + olx_id = [] + + # All filters + filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + "Lokacija_grad" : None, + "Brend" : None, + "Godište" : None, + "Kilometraža" : None, + "Vrsta" : None, + "Gorivo" : None, + "Konjskih snaga" : None, + "Kilovata (KW)" : None, + "Kubikaža" : None, + 
"Masa/Težina (kg)" : None, + "Airbag" : None, + "Broj prozora" : None, + "Mjesta za spavanje" : None, + "Pogon" : None, + "Veličina felgi" : None, + "Transmisija" : None, + "Boja" : None, + "Muzika" : None, + "Otvor na krovu" : None, + "Model" : None, + "Alarm" : None, + "Daljinsko otključavanje" : None, + "Registrovan" : None, + "Metalik" : None, + "Servisna knjiga" : None, + "El. podizači stakala" : None, + "Tempomat" : None, + "Servo volan" : None, + "Komande na volanu" : None, + "Navigacija" : None, + "Ocarinjen" : None, + "Strane tablice" : None, + "Kuhinja" : None, + "Šporet" : None, + "Sudoper" : None, + "Frižider" : None, + "Tenda" : None, + "Kupatilo (Tuš)" : None, + "WC" : None, + "Grijanje" : None, + "Klimatizirano" : None, + "Oštećen" : None, + "Datum" : None, + "Vrijeme" : None + } + + # Number of all items + broj_el = 0 + + # Getting all id's of articles + for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + # Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + + for i in range(0, broj_el): + + # New dictionary instance for every item + podaci = filters.copy() + + # Add kategorija + if (category_of_vehicle == 883): podaci['Kategorija'] = ('Kamperi') + + # Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + + + # Getting filters info from item + + # Osnovni filteri + + # Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + 
podaci['Cijena'] = item_cijena + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + podaci['Lokacija_grad'] = mojstring + + # Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + + + # Napredni filteri + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + + + + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test_kamperi1.xlsx',index=False) + + + + + + + + + + + + + + + + + + diff --git a/kivi_cars/allcrawlers/kampercrawler/test_kamper.py b/kivi_cars/allcrawlers/kampercrawler/test_kamper.py new file mode 100644 index 0000000..394367c --- /dev/null 
+++ b/kivi_cars/allcrawlers/kampercrawler/test_kamper.py @@ -0,0 +1,228 @@ +# ----------Imports------------ +from datetime import date +from traceback import print_tb +from unittest import result +from urllib import response +from urllib.request import Request +from bs4 import BeautifulSoup +from matplotlib import dates +from numpy import diag_indices +import requests +import pandas as pd +import random + +# List of User-Agent +user_agent_list = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1', + 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363', +] + +user_agent = user_agent_list[random.randint(0, len(user_agent_list)-1)] +headers = {'User-Agent': user_agent} + +# Array of object filteri + +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=883&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(i) + category_of_vehicle = 883 + + # Request to website + response_for_page = requests.get(main_website, headers=headers) + + # Soup object + soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + # Results + results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + + # List of olx id + olx_id = [] + + # All filters + filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + 
"Lokacija_grad" : None, + "Brend" : None, + "Godište" : None, + "Kilometraža" : None, + "Vrsta" : None, + "Gorivo" : None, + "Konjskih snaga" : None, + "Kilovata (KW)" : None, + "Kubikaža" : None, + "Masa/Težina (kg)" : None, + "Airbag" : None, + "Broj prozora" : None, + "Mjesta za spavanje" : None, + "Pogon" : None, + "Veličina felgi" : None, + "Transmisija" : None, + "Boja" : None, + "Muzika" : None, + "Otvor na krovu" : None, + "Model" : None, + "Alarm" : None, + "Daljinsko otključavanje" : None, + "Registrovan" : None, + "Metalik" : None, + "Servisna knjiga" : None, + "El. podizači stakala" : None, + "Tempomat" : None, + "Servo volan" : None, + "Komande na volanu" : None, + "Navigacija" : None, + "Ocarinjen" : None, + "Strane tablice" : None, + "Kuhinja" : None, + "Šporet" : None, + "Sudoper" : None, + "Frižider" : None, + "Tenda" : None, + "Kupatilo (Tuš)" : None, + "WC" : None, + "Grijanje" : None, + "Klimatizirano" : None, + "Oštećen" : None, + "Datum" : None, + "Vrijeme" : None + } + + # Number of all items + broj_el = 0 + + # Getting all id's of articles + for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + # Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + + for i in range(0, broj_el): + + # New dictionary instance for every item + podaci = filters.copy() + + # Add kategorija + if (category_of_vehicle == 883): podaci['Kategorija'] = ('Kamperi') + + # Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + # print(artikal_link) + + + # Getting filters info from item + + # Osnovni filteri + + # Cijena + 
if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena + # print(podaci['Cijena']) + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + # print(podaci['Lokacija_kanton']) + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + podaci['Lokacija_grad'] = mojstring + # print(podaci['Lokacija_grad']) + + # Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + # print(podaci['Stanje']) + + + # Napredni filteri + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + + + + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + # print(podaci["Datum"], podaci["Vrijeme"]) + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + 
print('--------------------------------------------------------------------') + + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test_kamperi1.xlsx',index=False) + + + + + + + + + + + + + + + + + + diff --git a/kivi_cars/allcrawlers/motorcrawler/olxmotorcrawler.py b/kivi_cars/allcrawlers/motorcrawler/olxmotorcrawler.py new file mode 100644 index 0000000..eff495f --- /dev/null +++ b/kivi_cars/allcrawlers/motorcrawler/olxmotorcrawler.py @@ -0,0 +1,185 @@ +from datetime import date +from bs4 import BeautifulSoup +from urllib import response +from urllib import request +import requests +import pandas as pd + + +user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36' + +headers = {'User-Agent': user_agent} + + + +# Array of object filteri +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?id=21&kategorija=21&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&kilometra-a_min=0&kilometra-a_max=0&stranica=' + str(i) + category_of_vehicle = 21 + +# Request to website +response_for_page = requests.get(main_website, headers=headers) + +# Soup object +soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + + + # Results +results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + +# List of olx id +olx_id = [] + +# All filters + +filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + "Lokacija_grad" : None, + "Brend" : None, + "Godište" : None, + "Kilometraža" : None, + "Tip/Vrsta motocikla" : None, + "Kubikaža (ccm)" : None, + "Konjskih snaga" : None, + "Masa/Težina (kg)" : None, + "Način hlađenja" : 
None, + "Vrsta mašine (broj taktova)" : None, + "Transmisija" : None, + "Zadnja guma (inch)" : None, + "Prednja guma (inch)" : None, + "Boja" : None, + "Model" : None, + "Registrovan" : None, + "Metalik" : None, + "Alarm" : None, + "Ocarinjen" : None, + "Udaren" : None, + "ABS" : None, + "Električni pogon (motor)" : None, + "Xenon svjetla" : None, + "Datum" : None, + "Vrijeme" : None + } + + +# Number of all items +broj_el = 0 + +# Getting all id's of articles + +for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + + +# Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + +for i in range(0, broj_el): + podaci = filters.copy() + + +# Add kategorija + if (category_of_vehicle == 21): podaci['Kategorija'] = ('Motocikli') + +# Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + # print(artikal_link) + + # Getting filters info from item + + # Osnovni filteri + +# Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena + # print(podaci['Cijena']) + + + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + # print(podaci['Lokacija_kanton']) + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' 
'.join(item_grad) + podaci['Lokacija_grad'] = mojstring + # print(podaci['Lokacija_grad']) + + # Brand + + if (result_item.find('div',{'itemprop':'brand'})): + x_brend = result_item.find('div',{'itemprop':'brand'}).find('a').get_text() + podaci['Brend'] = x_brend + +# Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + # print(podaci['Stanje']) + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + #print(df_pom1 + ' : ' + str(df_pom2)) + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + # print(podaci["Datum"], podaci["Vrijeme"]) + + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('proba-motocikli.xlsx',index=False) diff --git a/kivi_cars/allcrawlers/nautikacrawler/olxnautikacrawler.py b/kivi_cars/allcrawlers/nautikacrawler/olxnautikacrawler.py new file mode 100644 index 0000000..db1fa5e --- /dev/null +++ b/kivi_cars/allcrawlers/nautikacrawler/olxnautikacrawler.py @@ -0,0 +1,183 @@ +from datetime 
import date +from bs4 import BeautifulSoup +from urllib import response +from urllib import request +import requests +import pandas as pd + + +user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36' + +headers = {'User-Agent': user_agent} + + + +# Array of object filteri +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=426&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(i) + category_of_vehicle = 426 + +# Request to website +response_for_page = requests.get(main_website, headers=headers) + +# Soup object +soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + + + # Results +results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + +# List of olx id +olx_id = [] + +# All filters + +filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + "Lokacija_grad" : None, + "Brend" : None, + "Dužina (m)" : None, + "Širina (m)" : None, + "Vrsta broda" : None, + "Vrsta pogona (osnovni)" : None, + "Godište" : None, + "Model" : None, + "Visina (m)" : None, + "Alternativni pogon" : None, + "Jačina motora (KS)" : None, + "Radni sati motora" : None, + "Max. 
brzina (čvor)" : None, + "Masa (kg)" : None, + "Nosivost (t)" : None, + "Deplasman (t)" : None, + "Broj ležaja" : None, + "Broj prostorija/soba" : None, + "Spremnik goriva (L)" : None, + "Spremnik vode (L)" : None, + "Materijal izgradnje" : None, + "Kapacitet putnika" : None, + "Audio/Video" : None, + "Bitve za vezanje" : None, + "Motor (model)" : None, + "Gaz" : None, + "Klima" : None, + "Vjetrobran" : None, + "WC" : None, + "Datum" : None, + "Vrijeme" : None + } + + +# Number of all items +broj_el = 0 + +# Getting all id's of articles + +for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + + +# Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + +for i in range(0, broj_el): + podaci = filters.copy() + + +# Add kategorija + if (category_of_vehicle == 426): podaci['Kategorija'] = ('Nautika') + +# Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + + # Getting filters info from item + + # Osnovni filteri + +# Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena + + + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + 
podaci['Lokacija_grad'] = mojstring + + # Brand + + if (result_item.find('div',{'itemprop':'brand'})): + x_brend = result_item.find('div',{'itemprop':'brand'}).find('a').get_text() + +# Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test-nautika1.xlsx',index=False) diff --git a/kivi_cars/allcrawlers/nautikacrawler/test_nautika.py b/kivi_cars/allcrawlers/nautikacrawler/test_nautika.py new file mode 100644 index 0000000..1f0ab40 --- /dev/null +++ b/kivi_cars/allcrawlers/nautikacrawler/test_nautika.py @@ -0,0 +1,191 @@ +from datetime import date +from bs4 import BeautifulSoup +from urllib import response +from urllib import request +import requests +import pandas as pd + + +user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36' + +headers = {'User-Agent': user_agent} + + + +# Array of object filteri +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=426&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(i) + category_of_vehicle = 426 + +# Request to website +response_for_page = requests.get(main_website, headers=headers) + +# Soup object +soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + + + # Results +results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + +# List of olx id +olx_id = [] + +# All filters + +filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + "Lokacija_grad" : None, + "Brend" : None, + "Dužina (m)" : None, + "Širina (m)" : None, + "Vrsta broda" : None, + "Vrsta pogona (osnovni)" : None, + "Godište" : None, + "Model" : None, + "Visina (m)" : None, + "Alternativni pogon" : None, + "Jačina motora (KS)" : None, + "Radni sati motora" : None, + "Max. 
brzina (čvor)" : None, + "Masa (kg)" : None, + "Nosivost (t)" : None, + "Deplasman (t)" : None, + "Broj ležaja" : None, + "Broj prostorija/soba" : None, + "Spremnik goriva (L)" : None, + "Spremnik vode (L)" : None, + "Materijal izgradnje" : None, + "Kapacitet putnika" : None, + "Audio/Video" : None, + "Bitve za vezanje" : None, + "Motor (model)" : None, + "Gaz" : None, + "Klima" : None, + "Vjetrobran" : None, + "WC" : None, + "Datum" : None, + "Vrijeme" : None + } + + +# Number of all items +broj_el = 0 + +# Getting all id's of articles + +for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + + +# Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + +for i in range(0, broj_el): + podaci = filters.copy() + + +# Add kategorija + if (category_of_vehicle == 426): podaci['Kategorija'] = ('Nautika') + +# Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + print(artikal_link) + + # Getting filters info from item + + # Osnovni filteri + +# Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena + print(podaci['Cijena']) + + + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + print(podaci['Lokacija_kanton']) + + #grad 
+ x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + podaci['Lokacija_grad'] = mojstring + print(podaci['Lokacija_grad']) + + # Brand + + if (result_item.find('div',{'itemprop':'brand'})): + x_brend = result_item.find('div',{'itemprop':'brand'}).find('a').get_text() + podaci['Brend'] = x_brend + +# Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + print(podaci['Stanje']) + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + #print(df_pom1 + ' : ' + str(df_pom2)) + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + # print(podaci["Datum"], podaci["Vrijeme"]) + + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test-nautika1.xlsx',index=False) diff --git a/kivi_cars/allcrawlers/prikolicacrawler/olxprikolicacrawler.py b/kivi_cars/allcrawlers/prikolicacrawler/olxprikolicacrawler.py new file mode 100644 index 0000000..7a8ef0c --- /dev/null +++ 
b/kivi_cars/allcrawlers/prikolicacrawler/olxprikolicacrawler.py @@ -0,0 +1,178 @@ +# ----------Imports------------ +from datetime import date +from traceback import print_tb +from unittest import result +from urllib import response +from urllib.request import Request +from bs4 import BeautifulSoup +from matplotlib import dates +from numpy import diag_indices +import requests +import pandas as pd +import random + +# List of User-Agent +user_agent_list = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1', + 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363', +] + +user_agent = user_agent_list[random.randint(0, len(user_agent_list)-1)] +headers = {'User-Agent': user_agent} + +# Array of object filteri + +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=884&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&stranica=' + str(i) + category_of_vehicle = 884 + + # Request to website + response_for_page = requests.get(main_website, headers=headers) + + # Soup object + soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + # Results + results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + + # List of olx id + olx_id = [] + + # All filters + filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + 
"Lokacija_grad" : None, + "Brend" : None, + "Nosivost (kg)" : None, + "Vrsta" : None, + "Godina proizvodnje" : None, + "Masa/Težina (kg)" : None, + "Boja" : None, + "Model" : None, + "Zasebne kočnice" : None, + "Registrovana" : None, + "Ocarinjena" : None, + "Datum" : None, + "Vrijeme" : None + } + + # Number of all items + broj_el = 0 + + # Getting all id's of articles + for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + # Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + + for i in range(0, broj_el): + + # New dictionary instance for every item + podaci = filters.copy() + + # Add kategorija + if (category_of_vehicle == 884): podaci['Kategorija'] = ('Prikolice') + + # Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + + + # Getting filters info from item + + # Osnovni filteri + + # Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + podaci['Lokacija_grad'] = mojstring + + # Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = 
result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + + # Brand + + if (result_item.find_all('div',{'itemprop':'brand'})): + x_brand = result_item.find('div',{'itemprop':'brand'}).find('a').get_text() + podaci['Brend'] = x_brand + + + # Napredni filteri + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test_prikolica1.xlsx',index=False) + +print("Zavrseno!!") + diff --git a/kivi_cars/allcrawlers/prikolicacrawler/test_prikolica.py b/kivi_cars/allcrawlers/prikolicacrawler/test_prikolica.py new file mode 100644 index 0000000..121fdfc --- /dev/null +++ b/kivi_cars/allcrawlers/prikolicacrawler/test_prikolica.py @@ -0,0 +1,187 @@ +# ----------Imports------------ +from datetime import date +from traceback import print_tb +from unittest import result +from urllib import response +from urllib.request import Request +from bs4 import BeautifulSoup +from matplotlib import dates 
+from numpy import diag_indices +import requests +import pandas as pd +import random + +# List of User-Agent +user_agent_list = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1', + 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363', +] + +user_agent = user_agent_list[random.randint(0, len(user_agent_list)-1)] +headers = {'User-Agent': user_agent} + +# Array of object filteri + +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=884&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&stranica=' + str(i) + category_of_vehicle = 884 + + # Request to website + response_for_page = requests.get(main_website, headers=headers) + + # Soup object + soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + # Results + results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + + # List of olx id + olx_id = [] + + # All filters + filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + "Lokacija_grad" : None, + "Brend" : None, + "Nosivost (kg)" : None, + "Vrsta" : None, + "Godina proizvodnje" : None, + "Masa/Težina (kg)" : None, + "Boja" : None, + "Model" : None, + "Zasebne kočnice" : None, + "Registrovana" : None, + "Ocarinjena" : None, + "Datum" : None, + "Vrijeme" : None + } + + # Number of all items + broj_el = 0 
+ + # Getting all id's of articles + for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + # Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + + for i in range(0, broj_el): + + # New dictionary instance for every item + podaci = filters.copy() + + # Add kategorija + if (category_of_vehicle == 884): podaci['Kategorija'] = ('Prikolice') + + # Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + # print(artikal_link) + + + # Getting filters info from item + + # Osnovni filteri + + # Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena + # print(podaci['Cijena']) + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + # print(podaci['Lokacija_kanton']) + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + podaci['Lokacija_grad'] = mojstring + # print(podaci['Lokacija_grad']) + + # Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + # print(podaci['Stanje']) + + 
#-------------------------------------------------------------------------------------- + + # Brand + + if (result_item.find_all('div',{'itemprop':'brand'})): + x_brand = result_item.find('div',{'itemprop':'brand'}).find('a').get_text() + podaci['Brend'] = x_brand + # print(x_brand) + + + # Napredni filteri + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + # print(podaci["Datum"], podaci["Vrijeme"]) + + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test_prikolica1.xlsx',index=False) + +print("Zavrseno!!") + diff --git a/kivi_cars/allcrawlers/teretnacrawler/olxteretnacrawler.py b/kivi_cars/allcrawlers/teretnacrawler/olxteretnacrawler.py new file mode 100644 index 0000000..5fd1c18 --- /dev/null +++ b/kivi_cars/allcrawlers/teretnacrawler/olxteretnacrawler.py @@ -0,0 +1,223 @@ +# ----------Imports------------ +from datetime import date +from traceback import print_tb +from unittest import result +from urllib import response +from urllib.request import Request +from bs4 import BeautifulSoup +from 
matplotlib import dates +from numpy import diag_indices +import requests +import pandas as pd +import random + +# List of User-Agent +user_agent_list = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1', + 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363', +] + +user_agent = user_agent_list[random.randint(0, len(user_agent_list)-1)] +headers = {'User-Agent': user_agent} + +# Array of object filteri + +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=20&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(i) + category_of_vehicle = 20 + + # Request to website + response_for_page = requests.get(main_website, headers=headers) + + # Soup object + soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + # Results + results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + + # List of olx id + olx_id = [] + + # All filters + filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + "Lokacija_grad" : None, + "Brend" : None, + "Godište" : None, + "Kilometraža" : None, + "Tip" : None, + "Broj osovina" : None, + "Gorivo" : None, + "Konjskih snaga" : None, + "Kilovata (KW)" : None, + "Masa/Težina (kg)" : None, + "Ukupna dozvoljena masa (t)" : None, + "Dužina tovarnog prostora" : 
None, + "Širina tovarnog prostora" : None, + "Visina tovarnog prostora" : None, + "Emisioni standard" : None, + "Vrsta pogona" : None, + "Transmisija" : None, + "Nosivost (tona)" : None, + "Boja" : None, + "Muzika / ozvučenje" : None, + "Registrovan do" : None, + "Model" : None, + "Strane tablice" : None, + "Sa kranom" : None, + "Metalik" : None, + "Udaren" : None, + "Registrovan/Ocarinjen" : None, + "Servisna knjiga" : None, + "Servo volan" : None, + "El. podizači stakala" : None, + "Električni retrovizori" : None, + "Klima" : None, + "Navigacija" : None, + "Koža" : None, + "Xenon svjetla" : None, + "Alarm" : None, + "Daljinsko otključavanje" : None, + "Centralna brava" : None, + "Dupla kabina" : None, + "Datum" : None, + "Vrijeme" : None + } + + # Number of all items + broj_el = 0 + + # Getting all id's of articles + for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + # Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + + for i in range(0, broj_el): + + # New dictionary instance for every item + podaci = filters.copy() + + # Add kategorija + if (category_of_vehicle == 20): podaci['Kategorija'] = ('Teretna vozila') + + # Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item = BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + # print(artikal_link) + + + # Getting filters info from item + + # Osnovni filteri + + # Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena 
+ + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + podaci['Lokacija_grad'] = mojstring + + # Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + + # Brand + if (result_item.find_all('div',{'itemprop':'brand'})): + x_brand = result_item.find('div',{'itemprop':'brand'}).find('a').get_text() + podaci['Brend'] = x_brand + + + # Napredni filteri + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test_teretna2.xlsx',index=False) + +print("Zavrseno!!") + + + + + + + + + + + + + + + + + + diff --git 
a/kivi_cars/allcrawlers/teretnacrawler/test_teretna.py b/kivi_cars/allcrawlers/teretnacrawler/test_teretna.py new file mode 100644 index 0000000..3fcc26c --- /dev/null +++ b/kivi_cars/allcrawlers/teretnacrawler/test_teretna.py @@ -0,0 +1,240 @@ +# ----------Imports------------ +from datetime import date +from traceback import print_tb +from unittest import result +from urllib import response +from urllib.request import Request +from bs4 import BeautifulSoup +from matplotlib import dates +from numpy import diag_indices +import requests +import pandas as pd +import random + +# List of User-Agent +user_agent_list = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1', + 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363', +] + +user_agent = user_agent_list[random.randint(0, len(user_agent_list)-1)] +headers = {'User-Agent': user_agent} + +# Array of object filteri + +podaci_db = [] + +# Pagination cross webpages +# n is number of pages to crawl +pages_number_to_crawl = 2 +for i in range(1,pages_number_to_crawl): + # if kategorija=18 - Automobili + main_website = 'https://www.olx.ba/pretraga?kategorija=20&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(i) + category_of_vehicle = 20 + + # Request to website + response_for_page = requests.get(main_website, headers=headers) + + # Soup object + soup_page = BeautifulSoup(response_for_page.content, 'html.parser') + + # Results + results_all_items_per_page = soup_page.find_all('div',{'class':'listitem'}) + + # List 
of olx id + olx_id = [] + + # All filters + filters = { + "Olx_id" : None, + "Kategorija" : None, + "Cijena" : None, + "Stanje" : None, + "Lokacija_kanton" : None, + "Lokacija_grad" : None, + "Brend" : None, + "Godište" : None, + "Kilometraža" : None, + "Tip" : None, + "Broj osovina" : None, + "Gorivo" : None, + "Konjskih snaga" : None, + "Kilovata (KW)" : None, + "Masa/Težina (kg)" : None, + "Ukupna dozvoljena masa (t)" : None, + "Dužina tovarnog prostora" : None, + "Širina tovarnog prostora" : None, + "Visina tovarnog prostora" : None, + "Emisioni standard" : None, + "Vrsta pogona" : None, + "Transmisija" : None, + "Nosivost (tona)" : None, + "Boja" : None, + "Muzika / ozvučenje" : None, + "Registrovan do" : None, + "Model" : None, + "Strane tablice" : None, + "Sa kranom" : None, + "Metalik" : None, + "Udaren" : None, + "Registrovan/Ocarinjen" : None, + "Servisna knjiga" : None, + "Servo volan" : None, + "El. podizači stakala" : None, + "Električni retrovizori" : None, + "Klima" : None, + "Navigacija" : None, + "Koža" : None, + "Xenon svjetla" : None, + "Alarm" : None, + "Daljinsko otključavanje" : None, + "Centralna brava" : None, + "Dupla kabina" : None, + "Datum" : None, + "Vrijeme" : None + } + + # Number of all items + broj_el = 0 + + # Getting all id's of articles + for i in range(0, len(results_all_items_per_page)): + if(results_all_items_per_page[i].find('p')): + # Divide id from rest of link + address_content = results_all_items_per_page[i].find('a')['href'] + temp = address_content.split('/') + artikal_number = temp[4] + olx_id.append(artikal_number) + broj_el = broj_el + 1 + + for i in range(0, broj_el): + + # New dictionary instance for every item + podaci = filters.copy() + + # Add kategorija + if (category_of_vehicle == 20): podaci['Kategorija'] = ('Teretna vozila') + + # Artikal olx_link + artikal_link = 'https://www.olx.ba/artikal/' + olx_id[i] + podaci["Olx_id"] = olx_id[i] + response_item = requests.get(artikal_link, headers=headers) + soup_item 
= BeautifulSoup(response_item.content, 'html.parser') + result_item = soup_item.find('div',{'class':'artikal_lijevo'}) + # print(artikal_link) + + + # Getting filters info from item + + # Osnovni filteri + + # Cijena + if (result_item.find('div',{'id':'pc'})): + x_cijena = result_item.find('div',{'id':'pc'}).findAll('p') + item_cijena = x_cijena[1].get_text().split()[0] + if(item_cijena == 'Po'): + item_cijena = "Po dogovoru" + podaci['Cijena'] = item_cijena + # print(podaci['Cijena']) + + # Lokacija + #kanton + if (result_item.find('div',{'class':'mobile-lokacija'})): + x_lokacija = result_item.find('div',{'class':'mobile-lokacija'})['data-content'].split() + item_kanton = x_lokacija[0].replace(',','') + podaci['Lokacija_kanton'] = item_kanton + # print(podaci['Lokacija_kanton']) + + #grad + x_lokacija.pop(0) + item_grad = x_lokacija + mojstring = ' '.join(item_grad) + podaci['Lokacija_grad'] = mojstring + # print(podaci['Lokacija_grad']) + + # Stanje + if (result_item.find('div',{'class':'mobile-stanje'})): + x_stanje = result_item.find('div',{'class':'mobile-stanje'}).get_text().split() + item_stanje = x_stanje[1] + podaci['Stanje'] = item_stanje + # print(podaci['Stanje']) + + #-------------------------------------------------------------------------------------- + + # Brand + + if (result_item.find_all('div',{'itemprop':'brand'})): + x_brand = result_item.find('div',{'itemprop':'brand'}).find('a').get_text() + podaci['Brend'] = x_brand + # print(x_brand) + + + # Napredni filteri + + # Dodatna polja + if (result_item.find_all('div',{'id':'dodatnapolja1'})): + dodatnapolja_all_divs = result_item.find_all('div',{'id':'dodatnapolja1'}) + for i in range (0,len(dodatnapolja_all_divs)): + df_pom = dodatnapolja_all_divs[i].find_all('div',{'class','df'}) + for j in range (0,len(df_pom)): + df_pom1 = df_pom[j].find('div',{'class','df1'}).get_text() + if (df_pom[j].find('div',{'class','df2'}).find('i')): + df_pom2 = True + else : df_pom2 = 
df_pom[j].find('div',{'class','df2'}).get_text() + podaci[df_pom1] = df_pom2 + + # KW single + # kw = podaci['Kilovata (KW)'].split()[0] + # podaci['Kilovata (KW)'] = kw + # print(podaci['Kilovata (KW)']) + + + + # Vrijeme i datum + if (result_item.find('time', {'class' : 'entry-date'})): + date_time_div = result_item.find('time', {'class' : 'entry-date'}).get_text().split() + datum = date_time_div[0] + vrijeme = date_time_div[2] + podaci["Datum"] = datum + podaci["Vrijeme"] = vrijeme + # print(podaci["Datum"], podaci["Vrijeme"]) + + + print('.....................................................') + + # Insert datas to database + dictionary_copy = podaci.copy() + podaci_db.append(dictionary_copy) + + + + +# ------------- CREATE PANDAS DATAFRAME - DICTIONARY -------------- + +olx_db = pd.DataFrame(podaci_db) # treba biti niz +# print(olx_db) + +olx_db.to_excel('test_teretna2.xlsx',index=False) + +print("Zavrseno!!") + + + + + + + + + + + + + + + + + +