Merge branch 'add_crawler_first_version' into 'main'

add crawler first version

See merge request saburly/marketalarm/kivi-za-auta!7
This commit was merged in pull request #7.
This commit is contained in:
msosic97
2022-05-09 07:08:32 +00:00
16 changed files with 3501 additions and 0 deletions

View File

@@ -0,0 +1,323 @@
# ----------Imports------------
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request
from warnings import filters
from xml.etree.ElementTree import tostring
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices
import requests
import pandas as pd
import random
# Pool of browser User-Agent strings the crawler can present itself with.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]

# One entry is drawn uniformly at random, once per run, and reused as the
# User-Agent header of every request made with `headers`.
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}
# Debugging variant of the car crawler (OLX category 18). It still downloads
# the search-result pages and collects the article ids found there, but the
# detail-page parser is exercised against one hard-coded article per page and
# the parsed values are printed instead of being exported.

# One dict per parsed listing.
podaci_db = []

# Number of search-result pages to crawl; page numbering starts at 1.
pages_number_to_crawl = 2
# Category id used in the search URL (kategorija=18) and its display name.
category_of_vehicle = 18
category_names = {18: 'Automobili'}
# Seconds to wait for a response; without a timeout requests.get() can block
# indefinitely on a stalled connection.
request_timeout = 30
# Article whose detail page is parsed while debugging.
debug_article_id = '36976713'

# Column template for one listing. Every key starts as None so that each row
# has the same columns even when a listing omits a field.
filters = dict.fromkeys([
    "Olx_id", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton", "Lokacija_grad",
    "Proizvođač", "Model", "Godište", "Kilometraža", "Kilovata (KW)", "Kubikaža",
    "Gorivo", "Broj vrata", "Konjskih snaga", "Metalik", "Masa/Težina (kg)", "Tip",
    "Pogon", "Emisioni standard", "Veličina felgi", "Transmisija",
    "Broj stepeni prijenosa", "Boja", "Muzika / ozvučenje", "Parking senzori",
    "Parking kamera", "Registrovan do", "Godina prve registracije",
    "Broj prethodnih vlasnika", "Posjeduje gume", "Višezonska klima", "Rolo zavjese",
    "Svjetla", "Zaštita/Blokada", "Sjedećih mjesta", "Turbo", "DPF/FAP filter",
    "Strane tablice", "Ocarinjen", "Prilagođen invalidima", "Servo volan", "Tempomat",
    "ESP", "El. podizači stakala", "Senzor mrtvog ugla", "Digitalna klima",
    "Touch screen (ekran)", "Panorama krov", "Koža", "Masaža sjedišta",
    "El. pomjeranje sjedišta", "Senzor auto. svjetla", "Alarm",
    "Daljinsko otključavanje", "Auto kuka", "Udaren", "Start-Stop sistem",
    "Park assist", "Registrovan", "Na lizingu", "Servisna knjiga",
    "Komande na volanu", "ABS", "Airbag", "Električni retrovizori", "Klima",
    "Navigacija", "Šiber", "Naslon za ruku", "Hlađenje sjedišta",
    "Grijanje sjedišta", "Memorija sjedišta", "Alu felge", "Centralna brava",
    "Oldtimer", "ISOFIX", "Datum", "Vrijeme",
])

# The "ispod" blocks of a listing are read positionally, in this order.
ispod_keys = ['Proizvođač', 'Model', 'Godište', 'Kilometraža', 'Gorivo']

# range() excludes its stop value, hence the +1 to really visit
# `pages_number_to_crawl` pages.
for page_number in range(1, pages_number_to_crawl + 1):
    main_website = (
        'https://www.olx.ba/pretraga?id=18&kategorija=18&stanje=0&vrstapregleda=tabela'
        '&sort_order=desc&sort_po=datum&kilometra-a_min=0&kilometra-a_max=0&stranica='
        + str(page_number)
    )
    response_for_page = requests.get(main_website, headers=headers, timeout=request_timeout)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Collect the article ids listed on this page (not used further in this
    # debugging variant, which parses `debug_article_id` instead).
    olx_id = []
    for list_item in results_all_items_per_page:
        link = list_item.find('a')
        if not list_item.find('p') or link is None:
            continue
        # The id is taken as the fifth '/'-separated piece of the href
        # (presumably https://www.olx.ba/artikal/<id>/... - TODO confirm).
        href_parts = link.get('href', '').split('/')
        if len(href_parts) > 4:
            olx_id.append(href_parts[4])
    broj_el = len(olx_id)

    # Fresh row for the listing being parsed.
    podaci = filters.copy()
    podaci['Kategorija'] = category_names.get(category_of_vehicle)
    artikal_link = 'https://www.olx.ba/artikal/' + debug_article_id
    # Store the bare id, as the sibling crawlers do.
    podaci["Olx_id"] = debug_article_id
    response_item = requests.get(artikal_link, headers=headers, timeout=request_timeout)
    soup_item = BeautifulSoup(response_item.content, 'html.parser')
    result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
    print(artikal_link)
    if result_item is None:
        # Nothing to parse on this page; report it instead of crashing.
        print('Skipping ' + artikal_link + ': listing container not found')
        continue

    # --- Basic fields ---
    # Price: first word of the second <p> inside the "pc" box; "Po ..." is
    # normalised to "Po dogovoru".
    price_box = result_item.find('div', {'id': 'pc'})
    if price_box is not None:
        price_paragraphs = price_box.find_all('p')
        price_words = price_paragraphs[1].get_text().split() if len(price_paragraphs) > 1 else []
        if price_words:
            item_cijena = price_words[0]
            if item_cijena == 'Po':
                item_cijena = "Po dogovoru"
            podaci['Cijena'] = item_cijena
        print(podaci['Cijena'])

    # Location: first word (commas removed) is stored as the canton, the
    # remaining words as the city.
    location_box = result_item.find('div', {'class': 'mobile-lokacija'})
    if location_box is not None:
        x_lokacija = location_box.get('data-content', '').split()
        if x_lokacija:
            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])
        print(podaci['Lokacija_kanton'])
        print(podaci['Lokacija_grad'])

    # Condition: second word of the "mobile-stanje" text.
    condition_box = result_item.find('div', {'class': 'mobile-stanje'})
    if condition_box is not None:
        x_stanje = condition_box.get_text().split()
        if len(x_stanje) > 1:
            podaci['Stanje'] = x_stanje[1]
        print(podaci['Stanje'])

    # Manufacturer, model, year, mileage, fuel: value is the second <p> of
    # each "ispod" block, matched to its key by position.
    for key, ispod_div in zip(ispod_keys, result_item.find_all('div', {'class': 'ispod'})):
        paragraphs = ispod_div.find_all('p')
        if len(paragraphs) > 1:
            podaci[key] = paragraphs[1].get_text()
        print(podaci[key])

    # --- Additional fields ---
    # Each "df" row holds a label ("df1") and a value ("df2"). A value cell
    # containing an <i> element is stored as True (presumably a check-mark
    # icon - TODO confirm); otherwise its text is stored.
    for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
        for field in extra_block.find_all('div', {'class': 'df'}):
            label_div = field.find('div', {'class': 'df1'})
            value_div = field.find('div', {'class': 'df2'})
            if label_div is None or value_div is None:
                continue
            if value_div.find('i') is not None:
                podaci[label_div.get_text()] = True
            else:
                podaci[label_div.get_text()] = value_div.get_text()

    # Keep only the leading number of the kW value. The field may be missing
    # (None) or a flag (True), so only non-empty strings are split.
    kw_value = podaci['Kilovata (KW)']
    if isinstance(kw_value, str) and kw_value.split():
        kw = kw_value.split()[0]
        podaci['Kilovata (KW)'] = kw

    # Publication date and time: first and third words of the <time> text.
    time_tag = result_item.find('time', {'class': 'entry-date'})
    if time_tag is not None:
        date_time_div = time_tag.get_text().split()
        if len(date_time_div) > 2:
            podaci["Datum"] = date_time_div[0]
            podaci["Vrijeme"] = date_time_div[2]

    podaci_db.append(podaci)

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
# Export is disabled in this variant:
# olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
# olx_db.to_excel('proba2.xlsx',index=False)

View File

@@ -0,0 +1,304 @@
# ----------Imports------------
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices
import requests
import pandas as pd
import random
# Pool of browser User-Agent strings the crawler can present itself with.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]

# One entry is drawn uniformly at random, once per run, and reused as the
# User-Agent header of every request made with `headers`.
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}
# Crawler for car listings (OLX category 18): walks the search-result pages,
# opens every listing found there, scrapes its fields into a dict and finally
# writes all collected rows to an Excel file.

# Rows of the export: one dict per crawled listing.
podaci_db = []

# Number of search-result pages to crawl; page numbering starts at 1.
pages_number_to_crawl = 2
# Category id used in the search URL (kategorija=18) and its display name.
category_of_vehicle = 18
category_names = {18: 'Automobili'}
# Seconds to wait for a response; without a timeout requests.get() can block
# indefinitely on a stalled connection.
request_timeout = 30

# Column template for one listing. Every key starts as None so that each row
# of the export has the same columns even when a listing omits a field.
filters = dict.fromkeys([
    "Olx_id", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton", "Lokacija_grad",
    "Proizvođač", "Model", "Godište", "Kilometraža", "Kilovata (KW)", "Kubikaža",
    "Gorivo", "Broj vrata", "Konjskih snaga", "Metalik", "Masa/Težina (kg)", "Tip",
    "Pogon", "Emisioni standard", "Veličina felgi", "Transmisija",
    "Broj stepeni prijenosa", "Boja", "Muzika / ozvučenje", "Parking senzori",
    "Parking kamera", "Registrovan do", "Godina prve registracije",
    "Broj prethodnih vlasnika", "Posjeduje gume", "Višezonska klima", "Rolo zavjese",
    "Svjetla", "Zaštita/Blokada", "Sjedećih mjesta", "Turbo", "DPF/FAP filter",
    "Strane tablice", "Ocarinjen", "Prilagođen invalidima", "Servo volan", "Tempomat",
    "ESP", "El. podizači stakala", "Senzor mrtvog ugla", "Digitalna klima",
    "Touch screen (ekran)", "Panorama krov", "Koža", "Masaža sjedišta",
    "El. pomjeranje sjedišta", "Senzor auto. svjetla", "Alarm",
    "Daljinsko otključavanje", "Auto kuka", "Udaren", "Start-Stop sistem",
    "Park assist", "Registrovan", "Na lizingu", "Servisna knjiga",
    "Komande na volanu", "ABS", "Airbag", "Električni retrovizori", "Klima",
    "Navigacija", "Šiber", "Naslon za ruku", "Hlađenje sjedišta",
    "Grijanje sjedišta", "Memorija sjedišta", "Alu felge", "Centralna brava",
    "Oldtimer", "ISOFIX", "Datum", "Vrijeme",
])

# The "ispod" blocks of a listing are read positionally, in this order.
ispod_keys = ['Proizvođač', 'Model', 'Godište', 'Kilometraža', 'Gorivo']

# range() excludes its stop value, hence the +1 to really visit
# `pages_number_to_crawl` pages.
for page_number in range(1, pages_number_to_crawl + 1):
    main_website = (
        'https://www.olx.ba/pretraga?id=18&kategorija=18&stanje=0&vrstapregleda=tabela'
        '&sort_order=desc&sort_po=datum&kilometra-a_min=0&kilometra-a_max=0&stranica='
        + str(page_number)
    )
    response_for_page = requests.get(main_website, headers=headers, timeout=request_timeout)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Collect the article ids listed on this page.
    olx_id = []
    for list_item in results_all_items_per_page:
        link = list_item.find('a')
        if not list_item.find('p') or link is None:
            continue
        # The id is taken as the fifth '/'-separated piece of the href
        # (presumably https://www.olx.ba/artikal/<id>/... - TODO confirm).
        href_parts = link.get('href', '').split('/')
        if len(href_parts) > 4:
            olx_id.append(href_parts[4])
    broj_el = len(olx_id)

    for artikal_number in olx_id:
        # Fresh row for every listing.
        podaci = filters.copy()
        podaci['Kategorija'] = category_names.get(category_of_vehicle)
        podaci["Olx_id"] = artikal_number

        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        response_item = requests.get(artikal_link, headers=headers, timeout=request_timeout)
        soup_item = BeautifulSoup(response_item.content, 'html.parser')
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        if result_item is None:
            # Nothing to parse on this page; report it and move on instead
            # of aborting the whole crawl.
            print('Skipping ' + artikal_link + ': listing container not found')
            continue

        # --- Basic fields ---
        # Price: first word of the second <p> inside the "pc" box; "Po ..."
        # is normalised to "Po dogovoru".
        price_box = result_item.find('div', {'id': 'pc'})
        if price_box is not None:
            price_paragraphs = price_box.find_all('p')
            price_words = price_paragraphs[1].get_text().split() if len(price_paragraphs) > 1 else []
            if price_words:
                item_cijena = price_words[0]
                if item_cijena == 'Po':
                    item_cijena = "Po dogovoru"
                podaci['Cijena'] = item_cijena

        # Location: first word (commas removed) is stored as the canton, the
        # remaining words as the city.
        location_box = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_box is not None:
            x_lokacija = location_box.get('data-content', '').split()
            if x_lokacija:
                podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
                podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])

        # Condition: second word of the "mobile-stanje" text.
        condition_box = result_item.find('div', {'class': 'mobile-stanje'})
        if condition_box is not None:
            x_stanje = condition_box.get_text().split()
            if len(x_stanje) > 1:
                podaci['Stanje'] = x_stanje[1]

        # Manufacturer, model, year, mileage, fuel: value is the second <p>
        # of each "ispod" block, matched to its key by position.
        for key, ispod_div in zip(ispod_keys, result_item.find_all('div', {'class': 'ispod'})):
            paragraphs = ispod_div.find_all('p')
            if len(paragraphs) > 1:
                podaci[key] = paragraphs[1].get_text()

        # --- Additional fields ---
        # Each "df" row holds a label ("df1") and a value ("df2"). A value
        # cell containing an <i> element is stored as True (presumably a
        # check-mark icon - TODO confirm); otherwise its text is stored.
        for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for field in extra_block.find_all('div', {'class': 'df'}):
                label_div = field.find('div', {'class': 'df1'})
                value_div = field.find('div', {'class': 'df2'})
                if label_div is None or value_div is None:
                    continue
                if value_div.find('i') is not None:
                    podaci[label_div.get_text()] = True
                else:
                    podaci[label_div.get_text()] = value_div.get_text()

        # Keep only the leading number of the kW value. The field may be
        # missing (None) or a flag (True), so only non-empty strings are split.
        kw_value = podaci['Kilovata (KW)']
        if isinstance(kw_value, str) and kw_value.split():
            podaci['Kilovata (KW)'] = kw_value.split()[0]

        # Publication date and time: first and third words of the <time> text.
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            if len(date_time_div) > 2:
                podaci["Datum"] = date_time_div[0]
                podaci["Vrijeme"] = date_time_div[2]

        # Add the finished row to the collection.
        podaci_db.append(podaci)

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
olx_db.to_excel('proba2.xlsx', index=False)
print("Zavrseno!!")

View File

@@ -0,0 +1,313 @@
# ----------Imports------------
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices
import requests
import pandas as pd
import random
# Pool of browser User-Agent strings the crawler can present itself with.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]

# One entry is drawn uniformly at random, once per run, and reused as the
# User-Agent header of every request made with `headers`.
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}
# Crawler for vehicle listings: walks the search-result pages, opens every
# listing found there, scrapes its fields into a dict and prints the parsed
# values. The Excel export at the bottom is disabled.
#
# Search variants (vrsta=*):
# All listings:  https://www.olx.ba/pretraga?id=18&kategorija=18&sort_order=desc&sort_po=datum
# Sale only:     https://www.olx.ba/pretraga?id=18&kategorija=18&sort_order=desc&sort_po=datum&vrsta=samoprodaja
# Wanted only:   https://www.olx.ba/pretraga?id=18&kategorija=18&sort_order=desc&sort_po=datum&vrsta=samopotraznja
# With exchange (sazamjenom=sazamjenom):
# https://www.olx.ba/pretraga?id=18&kategorija=18&stanje=0&sort_order=desc&sort_po=datum&sazamjenom=sazamjenom

# One dict per crawled listing.
podaci_db = []

# Number of search-result pages to crawl; page numbering starts at 1.
pages_number_to_crawl = 2
# Category id used in the search URL and the display names of the categories
# this script knows. Looking the name up by id keeps "Kategorija" filled for
# the category that is actually crawled.
category_of_vehicle = 18
category_names = {18: 'Automobili', 884: 'Prikolice'}
# Seconds to wait for a response; without a timeout requests.get() can block
# indefinitely on a stalled connection.
request_timeout = 30

# Column template for one listing. Every key starts as None so that each row
# has the same columns even when a listing omits a field.
filters = dict.fromkeys([
    "Olx_id", "Vrsta_oglasa", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton",
    "Lokacija_grad", "Proizvođač", "Model", "Godište", "Kilometraža",
    "Kilovata (KW)", "Kubikaža", "Gorivo", "Broj vrata", "Konjskih snaga",
    "Metalik", "Masa/Težina (kg)", "Tip", "Pogon", "Emisioni standard",
    "Veličina felgi", "Transmisija", "Broj stepeni prijenosa", "Boja",
    "Muzika / ozvučenje", "Parking senzori", "Parking kamera", "Registrovan do",
    "Godina prve registracije", "Broj prethodnih vlasnika", "Posjeduje gume",
    "Višezonska klima", "Rolo zavjese", "Svjetla", "Zaštita/Blokada",
    "Sjedećih mjesta", "Turbo", "DPF/FAP filter", "Strane tablice", "Ocarinjen",
    "Prilagođen invalidima", "Servo volan", "Tempomat", "ESP",
    "El. podizači stakala", "Senzor mrtvog ugla", "Digitalna klima",
    "Touch screen (ekran)", "Panorama krov", "Koža", "Masaža sjedišta",
    "El. pomjeranje sjedišta", "Senzor auto. svjetla", "Alarm",
    "Daljinsko otključavanje", "Auto kuka", "Udaren", "Start-Stop sistem",
    "Park assist", "Registrovan", "Na lizingu", "Servisna knjiga",
    "Komande na volanu", "ABS", "Airbag", "Električni retrovizori", "Klima",
    "Navigacija", "Šiber", "Naslon za ruku", "Hlađenje sjedišta",
    "Grijanje sjedišta", "Memorija sjedišta", "Alu felge", "Centralna brava",
    "Oldtimer", "ISOFIX", "Datum", "Vrijeme",
])

# The "ispod" blocks of a listing are read positionally, in this order.
ispod_keys = ['Proizvođač', 'Model', 'Godište', 'Kilometraža', 'Gorivo']

# range() excludes its stop value, hence the +1 to really visit
# `pages_number_to_crawl` pages.
for page_number in range(1, pages_number_to_crawl + 1):
    main_website = (
        'https://www.olx.ba/pretraga?id=18&kategorija=18&stanje=0&vrstapregleda=tabela'
        '&sort_order=desc&sort_po=datum&kilometra-a_min=0&kilometra-a_max=0&stranica='
        + str(page_number)
    )
    response_for_page = requests.get(main_website, headers=headers, timeout=request_timeout)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Collect the article ids listed on this page.
    olx_id = []
    for list_item in results_all_items_per_page:
        link = list_item.find('a')
        if not list_item.find('p') or link is None:
            continue
        # The id is taken as the fifth '/'-separated piece of the href
        # (presumably https://www.olx.ba/artikal/<id>/... - TODO confirm).
        href_parts = link.get('href', '').split('/')
        if len(href_parts) > 4:
            olx_id.append(href_parts[4])
    broj_el = len(olx_id)

    for artikal_number in olx_id:
        # Fresh row for every listing.
        podaci = filters.copy()
        podaci['Kategorija'] = category_names.get(category_of_vehicle)
        podaci["Olx_id"] = artikal_number

        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        response_item = requests.get(artikal_link, headers=headers, timeout=request_timeout)
        soup_item = BeautifulSoup(response_item.content, 'html.parser')
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        if result_item is None:
            # Nothing to parse on this page; report it and move on instead
            # of aborting the whole crawl.
            print('Skipping ' + artikal_link + ': listing container not found')
            continue

        # --- Basic fields ---
        # Price: first word of the second <p> inside the "pc" box; "Po ..."
        # is normalised to "Po dogovoru".
        price_box = result_item.find('div', {'id': 'pc'})
        if price_box is not None:
            price_paragraphs = price_box.find_all('p')
            price_words = price_paragraphs[1].get_text().split() if len(price_paragraphs) > 1 else []
            if price_words:
                item_cijena = price_words[0]
                if item_cijena == 'Po':
                    item_cijena = "Po dogovoru"
                podaci['Cijena'] = item_cijena
            print(podaci['Cijena'])

        # Location: first word (commas removed) is stored as the canton, the
        # remaining words as the city.
        location_box = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_box is not None:
            x_lokacija = location_box.get('data-content', '').split()
            if x_lokacija:
                podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
                podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])
            print(podaci['Lokacija_kanton'])
            print(podaci['Lokacija_grad'])

        # Condition: second word of the "mobile-stanje" text.
        condition_box = result_item.find('div', {'class': 'mobile-stanje'})
        if condition_box is not None:
            x_stanje = condition_box.get_text().split()
            if len(x_stanje) > 1:
                podaci['Stanje'] = x_stanje[1]
            print(podaci['Stanje'])

        # Manufacturer, model, year, mileage, fuel: value is the second <p>
        # of each "ispod" block, matched to its key by position.
        for key, ispod_div in zip(ispod_keys, result_item.find_all('div', {'class': 'ispod'})):
            paragraphs = ispod_div.find_all('p')
            if len(paragraphs) > 1:
                podaci[key] = paragraphs[1].get_text()
            print(podaci[key])

        # --- Additional fields ---
        # Each "df" row holds a label ("df1") and a value ("df2"). A value
        # cell containing an <i> element is stored as True (presumably a
        # check-mark icon - TODO confirm); otherwise its text is stored.
        for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for field in extra_block.find_all('div', {'class': 'df'}):
                label_div = field.find('div', {'class': 'df1'})
                value_div = field.find('div', {'class': 'df2'})
                if label_div is None or value_div is None:
                    continue
                if value_div.find('i') is not None:
                    podaci[label_div.get_text()] = True
                else:
                    podaci[label_div.get_text()] = value_div.get_text()

        # Keep only the leading number of the kW value. The field may be
        # missing (None) or a flag (True), so only non-empty strings are split.
        kw_value = podaci['Kilovata (KW)']
        if isinstance(kw_value, str) and kw_value.split():
            kw = kw_value.split()[0]
            podaci['Kilovata (KW)'] = kw
            print(kw)

        # Publication date and time: first and third words of the <time> text.
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            if len(date_time_div) > 2:
                podaci["Datum"] = date_time_div[0]
                podaci["Vrijeme"] = date_time_div[2]

        # Add the finished row to the collection.
        podaci_db.append(podaci)
        print('--------------------------------------------------------------------')

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
# Export is disabled in this variant:
# olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
# olx_db.to_excel('probatest1.xlsx',index=False)

View File

@@ -0,0 +1,177 @@
from datetime import date
from bs4 import BeautifulSoup
from urllib import response
from urllib import request
import requests
import pandas as pd
# Crawler for bicycle listings (OLX category 22): walks the search-result
# pages, opens every listing found there, scrapes its fields into a dict and
# finally writes all collected rows to an Excel file.

# User-Agent header sent with every request.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
headers = {'User-Agent': user_agent}

# Rows of the export: one dict per crawled listing.
podaci_db = []

# Number of search-result pages to crawl; page numbering starts at 1.
pages_number_to_crawl = 2
# Category id used in the search URL (kategorija=22) and its display name.
category_of_vehicle = 22
category_names = {22: 'Bicikli'}
# Seconds to wait for a response; without a timeout requests.get() can block
# indefinitely on a stalled connection.
request_timeout = 30

# Column template for one listing. Every key starts as None so that each row
# of the export has the same columns even when a listing omits a field.
filters = dict.fromkeys([
    "Olx_id", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton", "Lokacija_grad",
    "Brend", "Broj brzina", "Tip", "Spol", "Masa", "Veličina rama",
    "Veličina točkova (inch)", "Godište", "Model", "Dječije", "Zadnji amortizer",
    "Disk kočnice", "Amortizer na sicu", "Svjetla/Signalizacija", "Gepek",
    "Prednji amortizer", "Nosač za vodu", "Datum", "Vrijeme",
])

# range() excludes its stop value, hence the +1 to really visit
# `pages_number_to_crawl` pages.
for page_number in range(1, pages_number_to_crawl + 1):
    main_website = (
        'https://www.olx.ba/pretraga?kategorija=22&id=1&stanje=0&vrstapregleda=tabela'
        '&sort_order=desc&sort_po=datum&stranica='
        + str(page_number)
    )
    response_for_page = requests.get(main_website, headers=headers, timeout=request_timeout)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Collect the article ids listed on this page.
    olx_id = []
    for list_item in results_all_items_per_page:
        link = list_item.find('a')
        if not list_item.find('p') or link is None:
            continue
        # The id is taken as the fifth '/'-separated piece of the href
        # (presumably https://www.olx.ba/artikal/<id>/... - TODO confirm).
        href_parts = link.get('href', '').split('/')
        if len(href_parts) > 4:
            olx_id.append(href_parts[4])
    broj_el = len(olx_id)

    for artikal_number in olx_id:
        # Fresh row for every listing.
        podaci = filters.copy()
        podaci['Kategorija'] = category_names.get(category_of_vehicle)
        podaci["Olx_id"] = artikal_number

        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        response_item = requests.get(artikal_link, headers=headers, timeout=request_timeout)
        soup_item = BeautifulSoup(response_item.content, 'html.parser')
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        if result_item is None:
            # Nothing to parse on this page; report it and move on instead
            # of aborting the whole crawl.
            print('Skipping ' + artikal_link + ': listing container not found')
            continue

        # --- Basic fields ---
        # Price: first word of the second <p> inside the "pc" box; "Po ..."
        # is normalised to "Po dogovoru".
        price_box = result_item.find('div', {'id': 'pc'})
        if price_box is not None:
            price_paragraphs = price_box.find_all('p')
            price_words = price_paragraphs[1].get_text().split() if len(price_paragraphs) > 1 else []
            if price_words:
                item_cijena = price_words[0]
                if item_cijena == 'Po':
                    item_cijena = "Po dogovoru"
                podaci['Cijena'] = item_cijena

        # Location: first word (commas removed) is stored as the canton, the
        # remaining words as the city.
        location_box = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_box is not None:
            x_lokacija = location_box.get('data-content', '').split()
            if x_lokacija:
                podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
                podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])

        # Brand: text of the link inside the itemprop="brand" box.
        brand_box = result_item.find('div', {'itemprop': 'brand'})
        if brand_box is not None:
            brand_link = brand_box.find('a')
            if brand_link is not None:
                podaci['Brend'] = brand_link.get_text()

        # Condition: second word of the "mobile-stanje" text.
        condition_box = result_item.find('div', {'class': 'mobile-stanje'})
        if condition_box is not None:
            x_stanje = condition_box.get_text().split()
            if len(x_stanje) > 1:
                podaci['Stanje'] = x_stanje[1]

        # --- Additional fields ---
        # Each "df" row holds a label ("df1") and a value ("df2"). A value
        # cell containing an <i> element is stored as True (presumably a
        # check-mark icon - TODO confirm); otherwise its text is stored.
        for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for field in extra_block.find_all('div', {'class': 'df'}):
                label_div = field.find('div', {'class': 'df1'})
                value_div = field.find('div', {'class': 'df2'})
                if label_div is None or value_div is None:
                    continue
                if value_div.find('i') is not None:
                    podaci[label_div.get_text()] = True
                else:
                    podaci[label_div.get_text()] = value_div.get_text()

        # Publication date and time: first and third words of the <time> text.
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            if len(date_time_div) > 2:
                podaci["Datum"] = date_time_div[0]
                podaci["Vrijeme"] = date_time_div[2]

        # Add the finished row to the collection.
        podaci_db.append(podaci)

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
olx_db.to_excel('proba-bicikli.xlsx', index=False)

View File

@@ -0,0 +1,185 @@
from datetime import date
from bs4 import BeautifulSoup
from urllib import response
from urllib import request
import requests
import pandas as pd
# Debugging variant of the bus/minibus crawler (OLX category 1040). It still
# downloads the search-result pages and collects the article ids found there,
# but the detail-page parser is exercised against one hard-coded article per
# page and the parsed values are printed instead of being exported.

# User-Agent header sent with every request.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
headers = {'User-Agent': user_agent}

# One dict per parsed listing.
podaci_db = []

# Number of search-result pages to crawl; page numbering starts at 1.
pages_number_to_crawl = 4
# Category id used in the search URL (kategorija=1040) and its display name.
# The id must match the key below, otherwise "Kategorija" stays empty.
category_of_vehicle = 1040
category_names = {1040: 'Autobusi i minibusi'}
# Seconds to wait for a response; without a timeout requests.get() can block
# indefinitely on a stalled connection.
request_timeout = 30
# Article whose detail page is parsed while debugging.
debug_article_id = '46785631'

# Column template for one listing. Every key starts as None so that each row
# has the same columns even when a listing omits a field.
filters = dict.fromkeys([
    "Olx_id", "Kategorija", "Cijena", "Stanje", "Lokacija_kanton", "Lokacija_grad",
    "Brend", "Kilometraža", "Gorivo", "Tip", "Godište", "Kilovata (KW)",
    "Konjskih snaga", "Kubikaža", "Boja", "Muzika / ozvučenje", "Transmisija",
    "Model", "El. podizači stakala", "Električni retrovizori", "Klima", "Metalik",
    "Navigacija", "Ocarinjen", "Registrovan", "Servisna knjiga", "Udaren",
    "Xenon svjetla", "Datum", "Vrijeme",
])

# range() excludes its stop value, hence the +1 to really visit
# `pages_number_to_crawl` pages.
for page_number in range(1, pages_number_to_crawl + 1):
    main_website = (
        'https://www.olx.ba/pretraga?kategorija=1040&id=1&stanje=0&vrstapregleda=tabela'
        '&sort_order=desc&sort_po=datum&stranica='
        + str(page_number)
    )
    response_for_page = requests.get(main_website, headers=headers, timeout=request_timeout)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Collect the article ids listed on this page (not used further in this
    # debugging variant, which parses `debug_article_id` instead).
    olx_id = []
    for list_item in results_all_items_per_page:
        link = list_item.find('a')
        if not list_item.find('p') or link is None:
            continue
        # The id is taken as the fifth '/'-separated piece of the href
        # (presumably https://www.olx.ba/artikal/<id>/... - TODO confirm).
        href_parts = link.get('href', '').split('/')
        if len(href_parts) > 4:
            olx_id.append(href_parts[4])
    broj_el = len(olx_id)

    # Fresh row for the listing being parsed.
    podaci = filters.copy()
    podaci['Kategorija'] = category_names.get(category_of_vehicle)
    artikal_link = 'https://www.olx.ba/artikal/' + debug_article_id
    podaci["Olx_id"] = debug_article_id
    response_item = requests.get(artikal_link, headers=headers, timeout=request_timeout)
    soup_item = BeautifulSoup(response_item.content, 'html.parser')
    result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
    print(artikal_link)
    if result_item is None:
        # Nothing to parse on this page; report it instead of crashing.
        print('Skipping ' + artikal_link + ': listing container not found')
        continue

    # --- Basic fields ---
    # Price: first word of the second <p> inside the "pc" box; "Po ..." is
    # normalised to "Po dogovoru".
    price_box = result_item.find('div', {'id': 'pc'})
    if price_box is not None:
        price_paragraphs = price_box.find_all('p')
        price_words = price_paragraphs[1].get_text().split() if len(price_paragraphs) > 1 else []
        if price_words:
            item_cijena = price_words[0]
            if item_cijena == 'Po':
                item_cijena = "Po dogovoru"
            podaci['Cijena'] = item_cijena
        print(podaci['Cijena'])

    # Location: first word (commas removed) is stored as the canton, the
    # remaining words as the city.
    location_box = result_item.find('div', {'class': 'mobile-lokacija'})
    if location_box is not None:
        x_lokacija = location_box.get('data-content', '').split()
        if x_lokacija:
            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])
        print(podaci['Lokacija_kanton'])
        print(podaci['Lokacija_grad'])

    # Brand: text of the link inside the itemprop="brand" box.
    brand_box = result_item.find('div', {'itemprop': 'brand'})
    if brand_box is not None:
        brand_link = brand_box.find('a')
        if brand_link is not None:
            podaci['Brend'] = brand_link.get_text()

    # Condition: second word of the "mobile-stanje" text.
    condition_box = result_item.find('div', {'class': 'mobile-stanje'})
    if condition_box is not None:
        x_stanje = condition_box.get_text().split()
        if len(x_stanje) > 1:
            podaci['Stanje'] = x_stanje[1]
        print(podaci['Stanje'])

    # --- Additional fields ---
    # Each "df" row holds a label ("df1") and a value ("df2"). A value cell
    # containing an <i> element is stored as True (presumably a check-mark
    # icon - TODO confirm); otherwise its text is stored.
    for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
        for field in extra_block.find_all('div', {'class': 'df'}):
            label_div = field.find('div', {'class': 'df1'})
            value_div = field.find('div', {'class': 'df2'})
            if label_div is None or value_div is None:
                continue
            df_pom1 = label_div.get_text()
            if value_div.find('i') is not None:
                df_pom2 = True
            else:
                df_pom2 = value_div.get_text()
            podaci[df_pom1] = df_pom2
            print(df_pom1 + ' : ' + str(df_pom2))

    # Publication date and time: first and third words of the <time> text.
    time_tag = result_item.find('time', {'class': 'entry-date'})
    if time_tag is not None:
        date_time_div = time_tag.get_text().split()
        if len(date_time_div) > 2:
            podaci["Datum"] = date_time_div[0]
            podaci["Vrijeme"] = date_time_div[2]

    podaci_db.append(podaci)

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
# Export is disabled in this variant:
# olx_db = pd.DataFrame(podaci_db)  # must be a list of dicts
# olx_db.to_excel('test-autobusi.xlsx',index=False)

View File

@@ -0,0 +1,178 @@
from datetime import date
from bs4 import BeautifulSoup
from urllib import response
from urllib import request
import requests
import pandas as pd
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
headers = {'User-Agent': user_agent}

# One dictionary per scraped listing; these become the rows of the export
podaci_db = []

# Category id sent in the search URL; 1040 is labelled 'Autobusi i minibusi'
category_of_vehicle = 1040

# Exclusive upper bound of the page range below: the value 2 requests only
# results page 1
pages_number_to_crawl = 2

# Row template: every known column starts as None, so a field that a
# listing does not provide stays None in its row
filters = {
    "Olx_id" : None,
    "Kategorija" : None,
    "Cijena" : None,
    "Stanje" : None,
    "Lokacija_kanton" : None,
    "Lokacija_grad" : None,
    "Brend" : None,
    "Kilometraža" : None,
    "Gorivo" : None,
    "Tip" : None,
    "Godište" : None,
    "Kilovata (KW)" : None,
    "Konjskih snaga" : None,
    "Kubikaža" : None,
    "Boja" : None,
    "Muzika / ozvučenje" : None,
    "Transmisija" : None,
    "Model" : None,
    "El. podizači stakala" : None,
    "Električni retrovizori" : None,
    "Klima" : None,
    "Metalik" : None,
    "Navigacija" : None,
    "Ocarinjen" : None,
    "Registrovan" : None,
    "Servisna knjiga" : None,
    "Udaren" : None,
    "Xenon svjetla" : None,
    "Datum" : None,
    "Vrijeme" : None
}

for page_number in range(1, pages_number_to_crawl):
    # Search results for the category, sorted by date descending (per the
    # URL parameters), one results page per request
    main_website = 'https://www.olx.ba/pretraga?kategorija=1040&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(page_number)
    response_for_page = requests.get(main_website, headers=headers)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Article ids found on this results page
    olx_id = []
    for list_item in results_all_items_per_page:
        # Rows without a <p> element are not treated as listings
        if list_item.find('p'):
            # The id is the fifth '/'-separated segment of the link target
            address_content = list_item.find('a')['href']
            olx_id.append(address_content.split('/')[4])

    for artikal_number in olx_id:
        # Fresh row for this article
        podaci = filters.copy()
        # Add kategorija
        if category_of_vehicle == 1040:
            podaci['Kategorija'] = 'Autobusi i minibusi'
        # Article link and id
        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        podaci["Olx_id"] = artikal_number
        response_item = requests.get(artikal_link, headers=headers)
        soup_item = BeautifulSoup(response_item.content, 'html.parser')
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        # Without the main container nothing below can be parsed; skip the
        # article instead of aborting the whole crawl with AttributeError.
        if result_item is None:
            print('Skipping ' + artikal_link + ': listing container not found')
            continue

        # Basic fields
        # Price: first token of the second <p> inside the element with id 'pc'
        price_div = result_item.find('div', {'id': 'pc'})
        if price_div is not None:
            item_cijena = price_div.findAll('p')[1].get_text().split()[0]
            # 'Po dogovoru' is cut to 'Po' by the split; restore the phrase
            if item_cijena == 'Po':
                item_cijena = "Po dogovoru"
            podaci['Cijena'] = item_cijena

        # Location: first token (comma removed) is stored as the canton,
        # the remaining tokens joined by spaces as the city
        location_div = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_div is not None:
            x_lokacija = location_div['data-content'].split()
            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])

        # Brand
        brand_div = result_item.find('div', {'itemprop': 'brand'})
        if brand_div is not None:
            podaci['Brend'] = brand_div.find('a').get_text()

        # Condition: second whitespace-separated token of the element text
        stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
        if stanje_div is not None:
            podaci['Stanje'] = stanje_div.get_text().split()[1]

        # Additional fields: each 'df' row holds a label ('df1') and a
        # value ('df2')
        for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for row in extra_block.find_all('div', {'class': 'df'}):
                df_pom1 = row.find('div', {'class': 'df1'}).get_text()
                value_div = row.find('div', {'class': 'df2'})
                # A value cell containing an <i> element is stored as True
                # (presumably a check-mark icon - confirm against the markup)
                if value_div.find('i'):
                    df_pom2 = True
                else:
                    df_pom2 = value_div.get_text()
                podaci[df_pom1] = df_pom2

        # Date and time: first and third token of the 'entry-date' text
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            podaci["Datum"] = date_time_div[0]
            podaci["Vrijeme"] = date_time_div[2]

        # Store the row; podaci is already a private copy of the template
        podaci_db.append(podaci)

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
# podaci_db must be a list (one dict per row)
olx_db = pd.DataFrame(podaci_db)
olx_db.to_excel('test-autobusi.xlsx', index=False)

View File

@@ -0,0 +1,185 @@
from datetime import date
from bs4 import BeautifulSoup
from urllib import response
from urllib import request
import requests
import pandas as pd
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
headers = {'User-Agent': user_agent}

# One dictionary per scraped listing; these become the rows of the export
podaci_db = []

# Category id sent in the search URL; 1040 is labelled 'Autobusi i minibusi'
category_of_vehicle = 1040

# Exclusive upper bound of the page range below: the value 2 requests only
# results page 1
pages_number_to_crawl = 2

# Row template: every known column starts as None, so a field that a
# listing does not provide stays None in its row
filters = {
    "Olx_id" : None,
    "Kategorija" : None,
    "Cijena" : None,
    "Stanje" : None,
    "Lokacija_kanton" : None,
    "Lokacija_grad" : None,
    "Brend" : None,
    "Kilometraža" : None,
    "Gorivo" : None,
    "Tip" : None,
    "Godište" : None,
    "Kilovata (KW)" : None,
    "Konjskih snaga" : None,
    "Kubikaža" : None,
    "Boja" : None,
    "Muzika / ozvučenje" : None,
    "Transmisija" : None,
    "Model" : None,
    "El. podizači stakala" : None,
    "Električni retrovizori" : None,
    "Klima" : None,
    "Metalik" : None,
    "Navigacija" : None,
    "Ocarinjen" : None,
    "Registrovan" : None,
    "Servisna knjiga" : None,
    "Udaren" : None,
    "Xenon svjetla" : None,
    "Datum" : None,
    "Vrijeme" : None
}

for page_number in range(1, pages_number_to_crawl):
    # Search results for the category, sorted by date descending (per the
    # URL parameters), one results page per request
    main_website = 'https://www.olx.ba/pretraga?kategorija=1040&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(page_number)
    response_for_page = requests.get(main_website, headers=headers)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Article ids found on this results page
    olx_id = []
    for list_item in results_all_items_per_page:
        # Rows without a <p> element are not treated as listings
        if list_item.find('p'):
            # The id is the fifth '/'-separated segment of the link target
            address_content = list_item.find('a')['href']
            olx_id.append(address_content.split('/')[4])

    for artikal_number in olx_id:
        # Fresh row for this article
        podaci = filters.copy()
        # Add kategorija
        if category_of_vehicle == 1040:
            podaci['Kategorija'] = 'Autobusi i minibusi'
        # Article link and id
        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        podaci["Olx_id"] = artikal_number
        response_item = requests.get(artikal_link, headers=headers)
        soup_item = BeautifulSoup(response_item.content, 'html.parser')
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        print(artikal_link)
        # Without the main container nothing below can be parsed; skip the
        # article instead of aborting the whole crawl with AttributeError.
        if result_item is None:
            print('Skipping ' + artikal_link + ': listing container not found')
            continue

        # Basic fields
        # Price: first token of the second <p> inside the element with id 'pc'
        price_div = result_item.find('div', {'id': 'pc'})
        if price_div is not None:
            item_cijena = price_div.findAll('p')[1].get_text().split()[0]
            # 'Po dogovoru' is cut to 'Po' by the split; restore the phrase
            if item_cijena == 'Po':
                item_cijena = "Po dogovoru"
            podaci['Cijena'] = item_cijena
            print(podaci['Cijena'])

        # Location: first token (comma removed) is stored as the canton,
        # the remaining tokens joined by spaces as the city
        location_div = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_div is not None:
            x_lokacija = location_div['data-content'].split()
            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
            print(podaci['Lokacija_kanton'])
            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])
            print(podaci['Lokacija_grad'])

        # Brand
        brand_div = result_item.find('div', {'itemprop': 'brand'})
        if brand_div is not None:
            podaci['Brend'] = brand_div.find('a').get_text()

        # Condition: second whitespace-separated token of the element text
        stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
        if stanje_div is not None:
            podaci['Stanje'] = stanje_div.get_text().split()[1]
            print(podaci['Stanje'])

        # Additional fields: each 'df' row holds a label ('df1') and a
        # value ('df2')
        for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for row in extra_block.find_all('div', {'class': 'df'}):
                df_pom1 = row.find('div', {'class': 'df1'}).get_text()
                value_div = row.find('div', {'class': 'df2'})
                # A value cell containing an <i> element is stored as True
                # (presumably a check-mark icon - confirm against the markup)
                if value_div.find('i'):
                    df_pom2 = True
                else:
                    df_pom2 = value_div.get_text()
                podaci[df_pom1] = df_pom2

        # Date and time: first and third token of the 'entry-date' text
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            podaci["Datum"] = date_time_div[0]
            podaci["Vrijeme"] = date_time_div[2]

        # Store the row; podaci is already a private copy of the template
        podaci_db.append(podaci)

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
# podaci_db must be a list (one dict per row)
olx_db = pd.DataFrame(podaci_db)
olx_db.to_excel('test-autobusi.xlsx', index=False)

View File

@@ -0,0 +1,221 @@
# ----------Imports------------
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices
import requests
import pandas as pd
import random
# Pool of User-Agent strings; one is picked at random per run
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}

# One dictionary per scraped listing; these become the rows of the export
podaci_db = []

# Category id sent in the search URL; 883 is labelled 'Kamperi'
category_of_vehicle = 883

# Exclusive upper bound of the page range below: the value 2 requests only
# results page 1
pages_number_to_crawl = 2

# Row template: every known column starts as None, so a field that a
# listing does not provide stays None in its row
filters = {
    "Olx_id" : None,
    "Kategorija" : None,
    "Cijena" : None,
    "Stanje" : None,
    "Lokacija_kanton" : None,
    "Lokacija_grad" : None,
    "Brend" : None,
    "Godište" : None,
    "Kilometraža" : None,
    "Vrsta" : None,
    "Gorivo" : None,
    "Konjskih snaga" : None,
    "Kilovata (KW)" : None,
    "Kubikaža" : None,
    "Masa/Težina (kg)" : None,
    "Airbag" : None,
    "Broj prozora" : None,
    "Mjesta za spavanje" : None,
    "Pogon" : None,
    "Veličina felgi" : None,
    "Transmisija" : None,
    "Boja" : None,
    "Muzika" : None,
    "Otvor na krovu" : None,
    "Model" : None,
    "Alarm" : None,
    "Daljinsko otključavanje" : None,
    "Registrovan" : None,
    "Metalik" : None,
    "Servisna knjiga" : None,
    "El. podizači stakala" : None,
    "Tempomat" : None,
    "Servo volan" : None,
    "Komande na volanu" : None,
    "Navigacija" : None,
    "Ocarinjen" : None,
    "Strane tablice" : None,
    "Kuhinja" : None,
    "Šporet" : None,
    "Sudoper" : None,
    "Frižider" : None,
    "Tenda" : None,
    "Kupatilo (Tuš)" : None,
    "WC" : None,
    "Grijanje" : None,
    "Klimatizirano" : None,
    "Oštećen" : None,
    "Datum" : None,
    "Vrijeme" : None
}

for page_number in range(1, pages_number_to_crawl):
    # Search results for the category, sorted by date descending (per the
    # URL parameters), one results page per request
    main_website = 'https://www.olx.ba/pretraga?kategorija=883&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(page_number)
    response_for_page = requests.get(main_website, headers=headers)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Article ids found on this results page
    olx_id = []
    for list_item in results_all_items_per_page:
        # Rows without a <p> element are not treated as listings
        if list_item.find('p'):
            # The id is the fifth '/'-separated segment of the link target
            address_content = list_item.find('a')['href']
            olx_id.append(address_content.split('/')[4])

    for artikal_number in olx_id:
        # New dictionary instance for every item
        podaci = filters.copy()
        # Add kategorija
        if category_of_vehicle == 883:
            podaci['Kategorija'] = 'Kamperi'
        # Article link and id
        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        podaci["Olx_id"] = artikal_number
        response_item = requests.get(artikal_link, headers=headers)
        soup_item = BeautifulSoup(response_item.content, 'html.parser')
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        # Without the main container nothing below can be parsed; skip the
        # article instead of aborting the whole crawl with AttributeError.
        if result_item is None:
            print('Skipping ' + artikal_link + ': listing container not found')
            continue

        # Basic fields
        # Price: first token of the second <p> inside the element with id 'pc'
        price_div = result_item.find('div', {'id': 'pc'})
        if price_div is not None:
            item_cijena = price_div.findAll('p')[1].get_text().split()[0]
            # 'Po dogovoru' is cut to 'Po' by the split; restore the phrase
            if item_cijena == 'Po':
                item_cijena = "Po dogovoru"
            podaci['Cijena'] = item_cijena

        # Location: first token (comma removed) is stored as the canton,
        # the remaining tokens joined by spaces as the city
        location_div = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_div is not None:
            x_lokacija = location_div['data-content'].split()
            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])

        # Condition: second whitespace-separated token of the element text
        stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
        if stanje_div is not None:
            podaci['Stanje'] = stanje_div.get_text().split()[1]

        # Advanced fields
        # Additional fields: each 'df' row holds a label ('df1') and a
        # value ('df2')
        for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for row in extra_block.find_all('div', {'class': 'df'}):
                df_pom1 = row.find('div', {'class': 'df1'}).get_text()
                value_div = row.find('div', {'class': 'df2'})
                # A value cell containing an <i> element is stored as True
                # (presumably a check-mark icon - confirm against the markup)
                if value_div.find('i'):
                    df_pom2 = True
                else:
                    df_pom2 = value_div.get_text()
                podaci[df_pom1] = df_pom2

        # Date and time: first and third token of the 'entry-date' text
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            podaci["Datum"] = date_time_div[0]
            podaci["Vrijeme"] = date_time_div[2]

        # Store the row; podaci is already a private copy of the template
        podaci_db.append(podaci)

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
# podaci_db must be a list (one dict per row)
olx_db = pd.DataFrame(podaci_db)
olx_db.to_excel('test_kamperi1.xlsx', index=False)

View File

@@ -0,0 +1,228 @@
# ----------Imports------------
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices
import requests
import pandas as pd
import random
# Pool of User-Agent strings; one is picked at random per run
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}

# One dictionary per scraped listing; these become the rows of the export
podaci_db = []

# Category id sent in the search URL; 883 is labelled 'Kamperi'
category_of_vehicle = 883

# Exclusive upper bound of the page range below: the value 2 requests only
# results page 1
pages_number_to_crawl = 2

# Row template: every known column starts as None, so a field that a
# listing does not provide stays None in its row
filters = {
    "Olx_id" : None,
    "Kategorija" : None,
    "Cijena" : None,
    "Stanje" : None,
    "Lokacija_kanton" : None,
    "Lokacija_grad" : None,
    "Brend" : None,
    "Godište" : None,
    "Kilometraža" : None,
    "Vrsta" : None,
    "Gorivo" : None,
    "Konjskih snaga" : None,
    "Kilovata (KW)" : None,
    "Kubikaža" : None,
    "Masa/Težina (kg)" : None,
    "Airbag" : None,
    "Broj prozora" : None,
    "Mjesta za spavanje" : None,
    "Pogon" : None,
    "Veličina felgi" : None,
    "Transmisija" : None,
    "Boja" : None,
    "Muzika" : None,
    "Otvor na krovu" : None,
    "Model" : None,
    "Alarm" : None,
    "Daljinsko otključavanje" : None,
    "Registrovan" : None,
    "Metalik" : None,
    "Servisna knjiga" : None,
    "El. podizači stakala" : None,
    "Tempomat" : None,
    "Servo volan" : None,
    "Komande na volanu" : None,
    "Navigacija" : None,
    "Ocarinjen" : None,
    "Strane tablice" : None,
    "Kuhinja" : None,
    "Šporet" : None,
    "Sudoper" : None,
    "Frižider" : None,
    "Tenda" : None,
    "Kupatilo (Tuš)" : None,
    "WC" : None,
    "Grijanje" : None,
    "Klimatizirano" : None,
    "Oštećen" : None,
    "Datum" : None,
    "Vrijeme" : None
}

for page_number in range(1, pages_number_to_crawl):
    # Search results for the category, sorted by date descending (per the
    # URL parameters), one results page per request
    main_website = 'https://www.olx.ba/pretraga?kategorija=883&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(page_number)
    response_for_page = requests.get(main_website, headers=headers)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Article ids found on this results page
    olx_id = []
    for list_item in results_all_items_per_page:
        # Rows without a <p> element are not treated as listings
        if list_item.find('p'):
            # The id is the fifth '/'-separated segment of the link target
            address_content = list_item.find('a')['href']
            olx_id.append(address_content.split('/')[4])

    for artikal_number in olx_id:
        # New dictionary instance for every item
        podaci = filters.copy()
        # Add kategorija
        if category_of_vehicle == 883:
            podaci['Kategorija'] = 'Kamperi'
        # Article link and id
        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        podaci["Olx_id"] = artikal_number
        response_item = requests.get(artikal_link, headers=headers)
        soup_item = BeautifulSoup(response_item.content, 'html.parser')
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        # Without the main container nothing below can be parsed; skip the
        # article instead of aborting the whole crawl with AttributeError.
        if result_item is None:
            print('Skipping ' + artikal_link + ': listing container not found')
            continue

        # Basic fields
        # Price: first token of the second <p> inside the element with id 'pc'
        price_div = result_item.find('div', {'id': 'pc'})
        if price_div is not None:
            item_cijena = price_div.findAll('p')[1].get_text().split()[0]
            # 'Po dogovoru' is cut to 'Po' by the split; restore the phrase
            if item_cijena == 'Po':
                item_cijena = "Po dogovoru"
            podaci['Cijena'] = item_cijena

        # Location: first token (comma removed) is stored as the canton,
        # the remaining tokens joined by spaces as the city
        location_div = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_div is not None:
            x_lokacija = location_div['data-content'].split()
            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])

        # Condition: second whitespace-separated token of the element text
        stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
        if stanje_div is not None:
            podaci['Stanje'] = stanje_div.get_text().split()[1]

        # Advanced fields
        # Additional fields: each 'df' row holds a label ('df1') and a
        # value ('df2')
        for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for row in extra_block.find_all('div', {'class': 'df'}):
                df_pom1 = row.find('div', {'class': 'df1'}).get_text()
                value_div = row.find('div', {'class': 'df2'})
                # A value cell containing an <i> element is stored as True
                # (presumably a check-mark icon - confirm against the markup)
                if value_div.find('i'):
                    df_pom2 = True
                else:
                    df_pom2 = value_div.get_text()
                podaci[df_pom1] = df_pom2

        # Date and time: first and third token of the 'entry-date' text
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            podaci["Datum"] = date_time_div[0]
            podaci["Vrijeme"] = date_time_div[2]

        # Store the row; podaci is already a private copy of the template
        podaci_db.append(podaci)
        # NOTE(review): separator placed per stored item; the original
        # nesting level of this print could not be recovered - confirm
        print('--------------------------------------------------------------------')

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
# podaci_db must be a list (one dict per row)
olx_db = pd.DataFrame(podaci_db)
olx_db.to_excel('test_kamperi1.xlsx', index=False)

View File

@@ -0,0 +1,185 @@
from datetime import date
from bs4 import BeautifulSoup
from urllib import response
from urllib import request
import requests
import pandas as pd
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
headers = {'User-Agent': user_agent}

# One dictionary per scraped listing; these become the rows of the export
podaci_db = []

# Category id sent in the search URL; 21 is labelled 'Motocikli'
category_of_vehicle = 21

# Exclusive upper bound of the page range below: the value 2 requests only
# results page 1
pages_number_to_crawl = 2

# Row template: every known column starts as None, so a field that a
# listing does not provide stays None in its row
filters = {
    "Olx_id" : None,
    "Kategorija" : None,
    "Cijena" : None,
    "Stanje" : None,
    "Lokacija_kanton" : None,
    "Lokacija_grad" : None,
    "Brend" : None,
    "Godište" : None,
    "Kilometraža" : None,
    "Tip/Vrsta motocikla" : None,
    "Kubikaža (ccm)" : None,
    "Konjskih snaga" : None,
    "Masa/Težina (kg)" : None,
    "Način hlađenja" : None,
    "Vrsta mašine (broj taktova)" : None,
    "Transmisija" : None,
    "Zadnja guma (inch)" : None,
    "Prednja guma (inch)" : None,
    "Boja" : None,
    "Model" : None,
    "Registrovan" : None,
    "Metalik" : None,
    "Alarm" : None,
    "Ocarinjen" : None,
    "Udaren" : None,
    "ABS" : None,
    "Električni pogon (motor)" : None,
    "Xenon svjetla" : None,
    "Datum" : None,
    "Vrijeme" : None
}

for page_number in range(1, pages_number_to_crawl):
    # Search results for the category, sorted by date descending (per the
    # URL parameters), one results page per request
    main_website = 'https://www.olx.ba/pretraga?id=21&kategorija=21&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&kilometra-a_min=0&kilometra-a_max=0&stranica=' + str(page_number)
    response_for_page = requests.get(main_website, headers=headers)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Article ids found on this results page
    olx_id = []
    for list_item in results_all_items_per_page:
        # Rows without a <p> element are not treated as listings
        if list_item.find('p'):
            # The id is the fifth '/'-separated segment of the link target
            address_content = list_item.find('a')['href']
            olx_id.append(address_content.split('/')[4])

    for artikal_number in olx_id:
        # Fresh row for this article
        podaci = filters.copy()
        # Add kategorija
        if category_of_vehicle == 21:
            podaci['Kategorija'] = 'Motocikli'
        # Article link and id
        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        podaci["Olx_id"] = artikal_number
        response_item = requests.get(artikal_link, headers=headers)
        soup_item = BeautifulSoup(response_item.content, 'html.parser')
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        # Without the main container nothing below can be parsed; skip the
        # article instead of aborting the whole crawl with AttributeError.
        if result_item is None:
            print('Skipping ' + artikal_link + ': listing container not found')
            continue

        # Basic fields
        # Price: first token of the second <p> inside the element with id 'pc'
        price_div = result_item.find('div', {'id': 'pc'})
        if price_div is not None:
            item_cijena = price_div.findAll('p')[1].get_text().split()[0]
            # 'Po dogovoru' is cut to 'Po' by the split; restore the phrase
            if item_cijena == 'Po':
                item_cijena = "Po dogovoru"
            podaci['Cijena'] = item_cijena

        # Location: first token (comma removed) is stored as the canton,
        # the remaining tokens joined by spaces as the city
        location_div = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_div is not None:
            x_lokacija = location_div['data-content'].split()
            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])

        # Brand
        brand_div = result_item.find('div', {'itemprop': 'brand'})
        if brand_div is not None:
            podaci['Brend'] = brand_div.find('a').get_text()

        # Condition: second whitespace-separated token of the element text
        stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
        if stanje_div is not None:
            podaci['Stanje'] = stanje_div.get_text().split()[1]

        # Additional fields: each 'df' row holds a label ('df1') and a
        # value ('df2')
        for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for row in extra_block.find_all('div', {'class': 'df'}):
                df_pom1 = row.find('div', {'class': 'df1'}).get_text()
                value_div = row.find('div', {'class': 'df2'})
                # A value cell containing an <i> element is stored as True
                # (presumably a check-mark icon - confirm against the markup)
                if value_div.find('i'):
                    df_pom2 = True
                else:
                    df_pom2 = value_div.get_text()
                podaci[df_pom1] = df_pom2

        # Date and time: first and third token of the 'entry-date' text
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            podaci["Datum"] = date_time_div[0]
            podaci["Vrijeme"] = date_time_div[2]

        # Store the row; podaci is already a private copy of the template
        podaci_db.append(podaci)

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
# podaci_db must be a list (one dict per row)
olx_db = pd.DataFrame(podaci_db)
olx_db.to_excel('proba-motocikli.xlsx', index=False)

View File

@@ -0,0 +1,183 @@
from datetime import date
from bs4 import BeautifulSoup
from urllib import response
from urllib import request
import requests
import pandas as pd
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
headers = {'User-Agent': user_agent}

# One dictionary per scraped listing; these become the rows of the export
podaci_db = []

# Category id sent in the search URL; 426 is labelled 'Nautika'
category_of_vehicle = 426

# Exclusive upper bound of the page range below: the value 2 requests only
# results page 1
pages_number_to_crawl = 2

# Row template: every known column starts as None, so a field that a
# listing does not provide stays None in its row
filters = {
    "Olx_id" : None,
    "Kategorija" : None,
    "Cijena" : None,
    "Stanje" : None,
    "Lokacija_kanton" : None,
    "Lokacija_grad" : None,
    "Brend" : None,
    "Dužina (m)" : None,
    "Širina (m)" : None,
    "Vrsta broda" : None,
    "Vrsta pogona (osnovni)" : None,
    "Godište" : None,
    "Model" : None,
    "Visina (m)" : None,
    "Alternativni pogon" : None,
    "Jačina motora (KS)" : None,
    "Radni sati motora" : None,
    "Max. brzina (čvor)" : None,
    "Masa (kg)" : None,
    "Nosivost (t)" : None,
    "Deplasman (t)" : None,
    "Broj ležaja" : None,
    "Broj prostorija/soba" : None,
    "Spremnik goriva (L)" : None,
    "Spremnik vode (L)" : None,
    "Materijal izgradnje" : None,
    "Kapacitet putnika" : None,
    "Audio/Video" : None,
    "Bitve za vezanje" : None,
    "Motor (model)" : None,
    "Gaz" : None,
    "Klima" : None,
    "Vjetrobran" : None,
    "WC" : None,
    "Datum" : None,
    "Vrijeme" : None
}

for page_number in range(1, pages_number_to_crawl):
    # Search results for the category, sorted by date descending (per the
    # URL parameters), one results page per request
    main_website = 'https://www.olx.ba/pretraga?kategorija=426&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica=' + str(page_number)
    response_for_page = requests.get(main_website, headers=headers)
    soup_page = BeautifulSoup(response_for_page.content, 'html.parser')
    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Article ids found on this results page
    olx_id = []
    for list_item in results_all_items_per_page:
        # Rows without a <p> element are not treated as listings
        if list_item.find('p'):
            # The id is the fifth '/'-separated segment of the link target
            address_content = list_item.find('a')['href']
            olx_id.append(address_content.split('/')[4])

    for artikal_number in olx_id:
        # Fresh row for this article
        podaci = filters.copy()
        # Add kategorija
        if category_of_vehicle == 426:
            podaci['Kategorija'] = 'Nautika'
        # Article link and id
        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        podaci["Olx_id"] = artikal_number
        response_item = requests.get(artikal_link, headers=headers)
        soup_item = BeautifulSoup(response_item.content, 'html.parser')
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        # Without the main container nothing below can be parsed; skip the
        # article instead of aborting the whole crawl with AttributeError.
        if result_item is None:
            print('Skipping ' + artikal_link + ': listing container not found')
            continue

        # Basic fields
        # Price: first token of the second <p> inside the element with id 'pc'
        price_div = result_item.find('div', {'id': 'pc'})
        if price_div is not None:
            item_cijena = price_div.findAll('p')[1].get_text().split()[0]
            # 'Po dogovoru' is cut to 'Po' by the split; restore the phrase
            if item_cijena == 'Po':
                item_cijena = "Po dogovoru"
            podaci['Cijena'] = item_cijena

        # Location: first token (comma removed) is stored as the canton,
        # the remaining tokens joined by spaces as the city
        location_div = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_div is not None:
            x_lokacija = location_div['data-content'].split()
            podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
            podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])

        # Brand: the value was previously extracted but never written to the
        # row, leaving the 'Brend' column always empty
        brand_div = result_item.find('div', {'itemprop': 'brand'})
        if brand_div is not None:
            podaci['Brend'] = brand_div.find('a').get_text()

        # Condition: second whitespace-separated token of the element text
        stanje_div = result_item.find('div', {'class': 'mobile-stanje'})
        if stanje_div is not None:
            podaci['Stanje'] = stanje_div.get_text().split()[1]

        # Additional fields: each 'df' row holds a label ('df1') and a
        # value ('df2')
        for extra_block in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for row in extra_block.find_all('div', {'class': 'df'}):
                df_pom1 = row.find('div', {'class': 'df1'}).get_text()
                value_div = row.find('div', {'class': 'df2'})
                # A value cell containing an <i> element is stored as True
                # (presumably a check-mark icon - confirm against the markup)
                if value_div.find('i'):
                    df_pom2 = True
                else:
                    df_pom2 = value_div.get_text()
                podaci[df_pom1] = df_pom2

        # Date and time: first and third token of the 'entry-date' text
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            podaci["Datum"] = date_time_div[0]
            podaci["Vrijeme"] = date_time_div[2]

        # Store the row; podaci is already a private copy of the template
        podaci_db.append(podaci)

# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
# podaci_db must be a list (one dict per row)
olx_db = pd.DataFrame(podaci_db)
olx_db.to_excel('test-nautika1.xlsx', index=False)

View File

@@ -0,0 +1,191 @@
from datetime import date
from bs4 import BeautifulSoup
from urllib import response
from urllib import request
import requests
import pandas as pd
# ----------Configuration------------
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
headers = {'User-Agent': user_agent}
# Number of result pages to crawl (pages 1 .. pages_number_to_crawl inclusive).
pages_number_to_crawl = 2
# Category id used in the search URL below (426 - Nautika).
category_of_vehicle = 426
category_names = {426: 'Nautika'}
# Search URL without the page number; the page number is appended per request.
main_website_base = 'https://www.olx.ba/pretraga?kategorija=426&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica='
article_base_url = 'https://www.olx.ba/artikal/'
output_file = 'test-nautika1.xlsx'
# Seconds to wait for a response, so one stalled request cannot hang the run.
request_timeout = 30

# All filters: every exported row starts with these columns, all empty.
filters = dict.fromkeys((
    "Olx_id",
    "Kategorija",
    "Cijena",
    "Stanje",
    "Lokacija_kanton",
    "Lokacija_grad",
    "Brend",
    "Dužina (m)",
    "Širina (m)",
    "Vrsta broda",
    "Vrsta pogona (osnovni)",
    "Godište",
    "Model",
    "Visina (m)",
    "Alternativni pogon",
    "Jačina motora (KS)",
    "Radni sati motora",
    "Max. brzina (čvor)",
    "Masa (kg)",
    "Nosivost (t)",
    "Deplasman (t)",
    "Broj ležaja",
    "Broj prostorija/soba",
    "Spremnik goriva (L)",
    "Spremnik vode (L)",
    "Materijal izgradnje",
    "Kapacitet putnika",
    "Audio/Video",
    "Bitve za vezanje",
    "Motor (model)",
    "Gaz",
    "Klima",
    "Vjetrobran",
    "WC",
    "Datum",
    "Vrijeme",
))


def fetch_soup(url):
    """Download *url* and return it as a BeautifulSoup tree, or None on a request error."""
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as error:
        print('Request failed for ' + url + ': ' + str(error))
        return None
    return BeautifulSoup(response.content, 'html.parser')


def extract_article_id(href):
    """Return the fifth '/'-separated part of *href*, or None when the link is too short."""
    parts = href.split('/')
    return parts[4] if len(parts) > 4 else None


def split_location(raw):
    """Split location text into (first word without commas, remaining words joined by spaces)."""
    words = raw.split()
    if not words:
        return None, None
    return words[0].replace(',', ''), ' '.join(words[1:])


def parse_price(text):
    """Return the first word of *text*; a leading 'Po' is expanded to "Po dogovoru"."""
    words = text.split()
    if not words:
        return None
    return "Po dogovoru" if words[0] == 'Po' else words[0]


def collect_article_ids(soup_page):
    """Return the article ids found in the list items of one result page."""
    article_ids = []
    for list_item in soup_page.find_all('div', {'class': 'listitem'}):
        # Only list items that contain a <p> are treated as articles.
        if not list_item.find('p'):
            continue
        link = list_item.find('a')
        href = link.get('href') if link is not None else None
        article_id = extract_article_id(href) if href else None
        if article_id:
            article_ids.append(article_id)
    return article_ids


def parse_article(result_item, article_id):
    """Build one export row (dict) from the article container *result_item*.

    Fields whose markup is missing or shorter than expected stay None
    instead of raising.
    """
    podaci = filters.copy()
    podaci['Kategorija'] = category_names.get(category_of_vehicle)
    podaci["Olx_id"] = article_id
    # Cijena: read from the second <p> inside the 'pc' box.
    price_box = result_item.find('div', {'id': 'pc'})
    if price_box is not None:
        paragraphs = price_box.find_all('p')
        if len(paragraphs) > 1:
            podaci['Cijena'] = parse_price(paragraphs[1].get_text())
            print(podaci['Cijena'])
    # Lokacija: kanton is the first word, grad is the rest.
    location_box = result_item.find('div', {'class': 'mobile-lokacija'})
    if location_box is not None:
        kanton, grad = split_location(location_box.get('data-content', ''))
        podaci['Lokacija_kanton'] = kanton
        print(podaci['Lokacija_kanton'])
        podaci['Lokacija_grad'] = grad
        print(podaci['Lokacija_grad'])
    # Brand
    brand_box = result_item.find('div', {'itemprop': 'brand'})
    brand_link = brand_box.find('a') if brand_box is not None else None
    if brand_link is not None:
        podaci['Brend'] = brand_link.get_text()
    # Stanje: the second word of the box text is stored.
    condition_box = result_item.find('div', {'class': 'mobile-stanje'})
    if condition_box is not None:
        words = condition_box.get_text().split()
        if len(words) > 1:
            podaci['Stanje'] = words[1]
            print(podaci['Stanje'])
    # Dodatna polja: each 'df' row holds a label (df1) and a value (df2).
    for extra_box in result_item.find_all('div', {'id': 'dodatnapolja1'}):
        for field in extra_box.find_all('div', {'class': 'df'}):
            label = field.find('div', {'class': 'df1'})
            value = field.find('div', {'class': 'df2'})
            if label is None or value is None:
                continue
            # A value cell containing an <i> element is stored as True.
            podaci[label.get_text()] = True if value.find('i') else value.get_text()
    # Vrijeme i datum: first word is the date, third word is the time.
    entry_date = result_item.find('time', {'class': 'entry-date'})
    if entry_date is not None:
        parts = entry_date.get_text().split()
        if len(parts) > 2:
            podaci["Datum"] = parts[0]
            podaci["Vrijeme"] = parts[2]
    return podaci


def crawl():
    """Crawl the configured result pages and return a list with one dict per article."""
    podaci_db = []
    # +1 because range() excludes its end and pages are numbered from 1.
    for page_number in range(1, pages_number_to_crawl + 1):
        soup_page = fetch_soup(main_website_base + str(page_number))
        if soup_page is None:
            continue
        for article_id in collect_article_ids(soup_page):
            artikal_link = article_base_url + article_id
            soup_item = fetch_soup(artikal_link)
            if soup_item is None:
                continue
            result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
            if result_item is None:
                # Page without the article container: nothing to parse.
                print('No article content found at ' + artikal_link)
                continue
            print(artikal_link)
            podaci_db.append(parse_article(result_item, article_id))
    return podaci_db


def main():
    """Run the crawl and write the collected rows to the Excel output file."""
    # ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
    olx_db = pd.DataFrame(crawl())  # built from a list of dicts
    olx_db.to_excel(output_file, index=False)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,178 @@
# ----------Imports------------
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices
import requests
import pandas as pd
import random
# List of User-Agent strings; one is chosen at random for the whole run.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}
# Number of result pages to crawl (pages 1 .. pages_number_to_crawl inclusive).
pages_number_to_crawl = 2
# Category id used in the search URL below (884 - Prikolice).
category_of_vehicle = 884
category_names = {884: 'Prikolice'}
# Search URL without the page number; the page number is appended per request.
main_website_base = 'https://www.olx.ba/pretraga?kategorija=884&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&stranica='
article_base_url = 'https://www.olx.ba/artikal/'
output_file = 'test_prikolica1.xlsx'
# Seconds to wait for a response, so one stalled request cannot hang the run.
request_timeout = 30

# All filters: every exported row starts with these columns, all empty.
filters = dict.fromkeys((
    "Olx_id",
    "Kategorija",
    "Cijena",
    "Stanje",
    "Lokacija_kanton",
    "Lokacija_grad",
    "Brend",
    "Nosivost (kg)",
    "Vrsta",
    "Godina proizvodnje",
    "Masa/Težina (kg)",
    "Boja",
    "Model",
    "Zasebne kočnice",
    "Registrovana",
    "Ocarinjena",
    "Datum",
    "Vrijeme",
))


def fetch_soup(url):
    """Download *url* and return it as a BeautifulSoup tree, or None on a request error."""
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as error:
        print('Request failed for ' + url + ': ' + str(error))
        return None
    return BeautifulSoup(response.content, 'html.parser')


def extract_article_id(href):
    """Return the fifth '/'-separated part of *href*, or None when the link is too short."""
    parts = href.split('/')
    return parts[4] if len(parts) > 4 else None


def split_location(raw):
    """Split location text into (first word without commas, remaining words joined by spaces)."""
    words = raw.split()
    if not words:
        return None, None
    return words[0].replace(',', ''), ' '.join(words[1:])


def parse_price(text):
    """Return the first word of *text*; a leading 'Po' is expanded to "Po dogovoru"."""
    words = text.split()
    if not words:
        return None
    return "Po dogovoru" if words[0] == 'Po' else words[0]


def collect_article_ids(soup_page):
    """Return the article ids found in the list items of one result page."""
    article_ids = []
    for list_item in soup_page.find_all('div', {'class': 'listitem'}):
        # Only list items that contain a <p> are treated as articles.
        if not list_item.find('p'):
            continue
        link = list_item.find('a')
        href = link.get('href') if link is not None else None
        article_id = extract_article_id(href) if href else None
        if article_id:
            article_ids.append(article_id)
    return article_ids


def parse_article(result_item, article_id):
    """Build one export row (dict) from the article container *result_item*.

    Fields whose markup is missing or shorter than expected stay None
    instead of raising.
    """
    # New dictionary instance for every item
    podaci = filters.copy()
    podaci['Kategorija'] = category_names.get(category_of_vehicle)
    podaci["Olx_id"] = article_id
    # Cijena: read from the second <p> inside the 'pc' box.
    price_box = result_item.find('div', {'id': 'pc'})
    if price_box is not None:
        paragraphs = price_box.find_all('p')
        if len(paragraphs) > 1:
            podaci['Cijena'] = parse_price(paragraphs[1].get_text())
    # Lokacija: kanton is the first word, grad is the rest.
    location_box = result_item.find('div', {'class': 'mobile-lokacija'})
    if location_box is not None:
        kanton, grad = split_location(location_box.get('data-content', ''))
        podaci['Lokacija_kanton'] = kanton
        podaci['Lokacija_grad'] = grad
    # Stanje: the second word of the box text is stored.
    condition_box = result_item.find('div', {'class': 'mobile-stanje'})
    if condition_box is not None:
        words = condition_box.get_text().split()
        if len(words) > 1:
            podaci['Stanje'] = words[1]
    # Brand
    brand_box = result_item.find('div', {'itemprop': 'brand'})
    brand_link = brand_box.find('a') if brand_box is not None else None
    if brand_link is not None:
        podaci['Brend'] = brand_link.get_text()
    # Dodatna polja: each 'df' row holds a label (df1) and a value (df2).
    for extra_box in result_item.find_all('div', {'id': 'dodatnapolja1'}):
        for field in extra_box.find_all('div', {'class': 'df'}):
            label = field.find('div', {'class': 'df1'})
            value = field.find('div', {'class': 'df2'})
            if label is None or value is None:
                continue
            # A value cell containing an <i> element is stored as True.
            podaci[label.get_text()] = True if value.find('i') else value.get_text()
    # Vrijeme i datum: first word is the date, third word is the time.
    entry_date = result_item.find('time', {'class': 'entry-date'})
    if entry_date is not None:
        parts = entry_date.get_text().split()
        if len(parts) > 2:
            podaci["Datum"] = parts[0]
            podaci["Vrijeme"] = parts[2]
    return podaci


def crawl():
    """Crawl the configured result pages and return a list with one dict per article."""
    podaci_db = []
    # +1 because range() excludes its end and pages are numbered from 1.
    for page_number in range(1, pages_number_to_crawl + 1):
        soup_page = fetch_soup(main_website_base + str(page_number))
        if soup_page is None:
            continue
        for article_id in collect_article_ids(soup_page):
            artikal_link = article_base_url + article_id
            soup_item = fetch_soup(artikal_link)
            if soup_item is None:
                continue
            result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
            if result_item is None:
                # Page without the article container: nothing to parse.
                print('No article content found at ' + artikal_link)
                continue
            podaci_db.append(parse_article(result_item, article_id))
    return podaci_db


def main():
    """Run the crawl and write the collected rows to the Excel output file."""
    # ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
    olx_db = pd.DataFrame(crawl())  # built from a list of dicts
    olx_db.to_excel(output_file, index=False)
    print("Zavrseno!!")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,187 @@
# ----------Imports------------
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices
import requests
import pandas as pd
import random
# List of User-Agent strings; one is chosen at random for the whole run.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}
# Number of result pages to crawl (pages 1 .. pages_number_to_crawl inclusive).
pages_number_to_crawl = 2
# Category id used in the search URL below (884 - Prikolice).
category_of_vehicle = 884
category_names = {884: 'Prikolice'}
# Search URL without the page number; the page number is appended per request.
main_website_base = 'https://www.olx.ba/pretraga?kategorija=884&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&stranica='
article_base_url = 'https://www.olx.ba/artikal/'
output_file = 'test_prikolica1.xlsx'
# Seconds to wait for a response, so one stalled request cannot hang the run.
request_timeout = 30

# All filters: every exported row starts with these columns, all empty.
filters = dict.fromkeys((
    "Olx_id",
    "Kategorija",
    "Cijena",
    "Stanje",
    "Lokacija_kanton",
    "Lokacija_grad",
    "Brend",
    "Nosivost (kg)",
    "Vrsta",
    "Godina proizvodnje",
    "Masa/Težina (kg)",
    "Boja",
    "Model",
    "Zasebne kočnice",
    "Registrovana",
    "Ocarinjena",
    "Datum",
    "Vrijeme",
))


def fetch_soup(url):
    """Download *url* and return it as a BeautifulSoup tree, or None on a request error."""
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as error:
        print('Request failed for ' + url + ': ' + str(error))
        return None
    return BeautifulSoup(response.content, 'html.parser')


def extract_article_id(href):
    """Return the fifth '/'-separated part of *href*, or None when the link is too short."""
    segments = href.split('/')
    if len(segments) <= 4:
        return None
    return segments[4]


def split_location(raw):
    """Split location text into (first word without commas, remaining words joined by spaces)."""
    words = raw.split()
    if not words:
        return None, None
    kanton = words[0].replace(',', '')
    grad = ' '.join(words[1:])
    return kanton, grad


def parse_price(text):
    """Return the first word of *text*; a leading 'Po' is expanded to "Po dogovoru"."""
    words = text.split()
    if not words:
        return None
    if words[0] == 'Po':
        return "Po dogovoru"
    return words[0]


def collect_article_ids(soup_page):
    """Return the article ids found in the list items of one result page."""
    article_ids = []
    for list_item in soup_page.find_all('div', {'class': 'listitem'}):
        # Only list items that contain a <p> are treated as articles.
        if not list_item.find('p'):
            continue
        link = list_item.find('a')
        href = link.get('href') if link is not None else None
        article_id = extract_article_id(href) if href else None
        if article_id:
            article_ids.append(article_id)
    return article_ids


def parse_basic_fields(result_item, podaci):
    """Fill price, location, condition and brand of *podaci* from *result_item*."""
    # Cijena: read from the second <p> inside the 'pc' box.
    price_box = result_item.find('div', {'id': 'pc'})
    if price_box is not None:
        paragraphs = price_box.find_all('p')
        if len(paragraphs) > 1:
            podaci['Cijena'] = parse_price(paragraphs[1].get_text())
    # Lokacija: kanton is the first word, grad is the rest.
    location_box = result_item.find('div', {'class': 'mobile-lokacija'})
    if location_box is not None:
        kanton, grad = split_location(location_box.get('data-content', ''))
        podaci['Lokacija_kanton'] = kanton
        podaci['Lokacija_grad'] = grad
    # Stanje: the second word of the box text is stored.
    condition_box = result_item.find('div', {'class': 'mobile-stanje'})
    if condition_box is not None:
        words = condition_box.get_text().split()
        if len(words) > 1:
            podaci['Stanje'] = words[1]
    # Brand
    brand_box = result_item.find('div', {'itemprop': 'brand'})
    brand_link = brand_box.find('a') if brand_box is not None else None
    if brand_link is not None:
        podaci['Brend'] = brand_link.get_text()


def parse_extra_fields(result_item, podaci):
    """Copy every label/value row of the 'dodatnapolja1' boxes into *podaci*."""
    for extra_box in result_item.find_all('div', {'id': 'dodatnapolja1'}):
        for field in extra_box.find_all('div', {'class': 'df'}):
            label = field.find('div', {'class': 'df1'})
            value = field.find('div', {'class': 'df2'})
            if label is None or value is None:
                continue
            # A value cell containing an <i> element is stored as True.
            podaci[label.get_text()] = True if value.find('i') else value.get_text()


def parse_article(result_item, article_id):
    """Build one export row (dict) from the article container *result_item*.

    Fields whose markup is missing or shorter than expected stay None
    instead of raising.
    """
    # New dictionary instance for every item
    podaci = filters.copy()
    podaci['Kategorija'] = category_names.get(category_of_vehicle)
    podaci["Olx_id"] = article_id
    parse_basic_fields(result_item, podaci)
    parse_extra_fields(result_item, podaci)
    # Vrijeme i datum: first word is the date, third word is the time.
    entry_date = result_item.find('time', {'class': 'entry-date'})
    if entry_date is not None:
        parts = entry_date.get_text().split()
        if len(parts) > 2:
            podaci["Datum"] = parts[0]
            podaci["Vrijeme"] = parts[2]
    return podaci


def crawl():
    """Crawl the configured result pages and return a list with one dict per article."""
    podaci_db = []
    # +1 because range() excludes its end and pages are numbered from 1.
    for page_number in range(1, pages_number_to_crawl + 1):
        soup_page = fetch_soup(main_website_base + str(page_number))
        if soup_page is None:
            continue
        for article_id in collect_article_ids(soup_page):
            artikal_link = article_base_url + article_id
            soup_item = fetch_soup(artikal_link)
            if soup_item is None:
                continue
            result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
            if result_item is None:
                # Page without the article container: nothing to parse.
                print('No article content found at ' + artikal_link)
                continue
            podaci_db.append(parse_article(result_item, article_id))
    return podaci_db


def main():
    """Run the crawl and write the collected rows to the Excel output file."""
    # ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
    olx_db = pd.DataFrame(crawl())  # built from a list of dicts
    olx_db.to_excel(output_file, index=False)
    print("Zavrseno!!")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,223 @@
# ----------Imports------------
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices
import requests
import pandas as pd
import random
# List of User-Agent strings; one is chosen at random for the whole run.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}
# Number of result pages to crawl (pages 1 .. pages_number_to_crawl inclusive).
pages_number_to_crawl = 2
# Category id used in the search URL below (20 - Teretna vozila).
category_of_vehicle = 20
category_names = {20: 'Teretna vozila'}
# Search URL without the page number; the page number is appended per request.
main_website_base = 'https://www.olx.ba/pretraga?kategorija=20&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica='
article_base_url = 'https://www.olx.ba/artikal/'
output_file = 'test_teretna2.xlsx'
# Seconds to wait for a response, so one stalled request cannot hang the run.
request_timeout = 30

# All filters: every exported row starts with these columns, all empty.
filters = dict.fromkeys((
    "Olx_id",
    "Kategorija",
    "Cijena",
    "Stanje",
    "Lokacija_kanton",
    "Lokacija_grad",
    "Brend",
    "Godište",
    "Kilometraža",
    "Tip",
    "Broj osovina",
    "Gorivo",
    "Konjskih snaga",
    "Kilovata (KW)",
    "Masa/Težina (kg)",
    "Ukupna dozvoljena masa (t)",
    "Dužina tovarnog prostora",
    "Širina tovarnog prostora",
    "Visina tovarnog prostora",
    "Emisioni standard",
    "Vrsta pogona",
    "Transmisija",
    "Nosivost (tona)",
    "Boja",
    "Muzika / ozvučenje",
    "Registrovan do",
    "Model",
    "Strane tablice",
    "Sa kranom",
    "Metalik",
    "Udaren",
    "Registrovan/Ocarinjen",
    "Servisna knjiga",
    "Servo volan",
    "El. podizači stakala",
    "Električni retrovizori",
    "Klima",
    "Navigacija",
    "Koža",
    "Xenon svjetla",
    "Alarm",
    "Daljinsko otključavanje",
    "Centralna brava",
    "Dupla kabina",
    "Datum",
    "Vrijeme",
))


def fetch_soup(url):
    """Download *url* and return it as a BeautifulSoup tree, or None on a request error."""
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as error:
        print('Request failed for ' + url + ': ' + str(error))
        return None
    return BeautifulSoup(response.content, 'html.parser')


def extract_article_id(href):
    """Return the fifth '/'-separated part of *href*, or None when the link is too short."""
    parts = href.split('/')
    return parts[4] if len(parts) > 4 else None


def split_location(raw):
    """Split location text into (first word without commas, remaining words joined by spaces)."""
    words = raw.split()
    if not words:
        return None, None
    return words[0].replace(',', ''), ' '.join(words[1:])


def parse_price(text):
    """Return the first word of *text*; a leading 'Po' is expanded to "Po dogovoru"."""
    words = text.split()
    if not words:
        return None
    return "Po dogovoru" if words[0] == 'Po' else words[0]


def collect_article_ids(soup_page):
    """Return the article ids found in the list items of one result page."""
    article_ids = []
    for list_item in soup_page.find_all('div', {'class': 'listitem'}):
        # Only list items that contain a <p> are treated as articles.
        if not list_item.find('p'):
            continue
        link = list_item.find('a')
        href = link.get('href') if link is not None else None
        article_id = extract_article_id(href) if href else None
        if article_id:
            article_ids.append(article_id)
    return article_ids


def parse_article(result_item, article_id):
    """Build one export row (dict) from the article container *result_item*.

    Fields whose markup is missing or shorter than expected stay None
    instead of raising.
    """
    # New dictionary instance for every item
    podaci = filters.copy()
    podaci['Kategorija'] = category_names.get(category_of_vehicle)
    podaci["Olx_id"] = article_id
    # Cijena: read from the second <p> inside the 'pc' box.
    price_box = result_item.find('div', {'id': 'pc'})
    if price_box is not None:
        paragraphs = price_box.find_all('p')
        if len(paragraphs) > 1:
            podaci['Cijena'] = parse_price(paragraphs[1].get_text())
    # Lokacija: kanton is the first word, grad is the rest.
    location_box = result_item.find('div', {'class': 'mobile-lokacija'})
    if location_box is not None:
        kanton, grad = split_location(location_box.get('data-content', ''))
        podaci['Lokacija_kanton'] = kanton
        podaci['Lokacija_grad'] = grad
    # Stanje: the second word of the box text is stored.
    condition_box = result_item.find('div', {'class': 'mobile-stanje'})
    if condition_box is not None:
        words = condition_box.get_text().split()
        if len(words) > 1:
            podaci['Stanje'] = words[1]
    # Brand
    brand_box = result_item.find('div', {'itemprop': 'brand'})
    brand_link = brand_box.find('a') if brand_box is not None else None
    if brand_link is not None:
        podaci['Brend'] = brand_link.get_text()
    # Dodatna polja: each 'df' row holds a label (df1) and a value (df2).
    for extra_box in result_item.find_all('div', {'id': 'dodatnapolja1'}):
        for field in extra_box.find_all('div', {'class': 'df'}):
            label = field.find('div', {'class': 'df1'})
            value = field.find('div', {'class': 'df2'})
            if label is None or value is None:
                continue
            # A value cell containing an <i> element is stored as True.
            podaci[label.get_text()] = True if value.find('i') else value.get_text()
    # Vrijeme i datum: first word is the date, third word is the time.
    entry_date = result_item.find('time', {'class': 'entry-date'})
    if entry_date is not None:
        parts = entry_date.get_text().split()
        if len(parts) > 2:
            podaci["Datum"] = parts[0]
            podaci["Vrijeme"] = parts[2]
    return podaci


def crawl():
    """Crawl the configured result pages and return a list with one dict per article."""
    podaci_db = []
    # +1 because range() excludes its end and pages are numbered from 1.
    for page_number in range(1, pages_number_to_crawl + 1):
        soup_page = fetch_soup(main_website_base + str(page_number))
        if soup_page is None:
            continue
        for article_id in collect_article_ids(soup_page):
            artikal_link = article_base_url + article_id
            soup_item = fetch_soup(artikal_link)
            if soup_item is None:
                continue
            result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
            if result_item is None:
                # Page without the article container: nothing to parse.
                print('No article content found at ' + artikal_link)
                continue
            podaci_db.append(parse_article(result_item, article_id))
    return podaci_db


def main():
    """Run the crawl and write the collected rows to the Excel output file."""
    # ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
    olx_db = pd.DataFrame(crawl())  # built from a list of dicts
    olx_db.to_excel(output_file, index=False)
    print("Zavrseno!!")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,240 @@
# ----------Imports------------
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices
import requests
import pandas as pd
import random
# List of User-Agent strings; one is chosen at random for the whole run.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}
# Number of result pages to crawl (pages 1 .. pages_number_to_crawl inclusive).
pages_number_to_crawl = 2
# Category id used in the search URL below (20 - Teretna vozila).
category_of_vehicle = 20
category_names = {20: 'Teretna vozila'}
# Search URL without the page number; the page number is appended per request.
main_website_base = 'https://www.olx.ba/pretraga?kategorija=20&id=1&stanje=0&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica='
article_base_url = 'https://www.olx.ba/artikal/'
output_file = 'test_teretna2.xlsx'
# Seconds to wait for a response, so one stalled request cannot hang the run.
request_timeout = 30

# All filters: every exported row starts with these columns, all empty.
filters = dict.fromkeys((
    "Olx_id",
    "Kategorija",
    "Cijena",
    "Stanje",
    "Lokacija_kanton",
    "Lokacija_grad",
    "Brend",
    "Godište",
    "Kilometraža",
    "Tip",
    "Broj osovina",
    "Gorivo",
    "Konjskih snaga",
    "Kilovata (KW)",
    "Masa/Težina (kg)",
    "Ukupna dozvoljena masa (t)",
    "Dužina tovarnog prostora",
    "Širina tovarnog prostora",
    "Visina tovarnog prostora",
    "Emisioni standard",
    "Vrsta pogona",
    "Transmisija",
    "Nosivost (tona)",
    "Boja",
    "Muzika / ozvučenje",
    "Registrovan do",
    "Model",
    "Strane tablice",
    "Sa kranom",
    "Metalik",
    "Udaren",
    "Registrovan/Ocarinjen",
    "Servisna knjiga",
    "Servo volan",
    "El. podizači stakala",
    "Električni retrovizori",
    "Klima",
    "Navigacija",
    "Koža",
    "Xenon svjetla",
    "Alarm",
    "Daljinsko otključavanje",
    "Centralna brava",
    "Dupla kabina",
    "Datum",
    "Vrijeme",
))


def fetch_soup(url):
    """Download *url* and return it as a BeautifulSoup tree, or None on a request error."""
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as error:
        print('Request failed for ' + url + ': ' + str(error))
        return None
    return BeautifulSoup(response.content, 'html.parser')


def extract_article_id(href):
    """Return the fifth '/'-separated part of *href*, or None when the link is too short."""
    segments = href.split('/')
    if len(segments) <= 4:
        return None
    return segments[4]


def split_location(raw):
    """Split location text into (first word without commas, remaining words joined by spaces)."""
    words = raw.split()
    if not words:
        return None, None
    kanton = words[0].replace(',', '')
    grad = ' '.join(words[1:])
    return kanton, grad


def parse_price(text):
    """Return the first word of *text*; a leading 'Po' is expanded to "Po dogovoru"."""
    words = text.split()
    if not words:
        return None
    if words[0] == 'Po':
        return "Po dogovoru"
    return words[0]


def collect_article_ids(soup_page):
    """Return the article ids found in the list items of one result page."""
    article_ids = []
    for list_item in soup_page.find_all('div', {'class': 'listitem'}):
        # Only list items that contain a <p> are treated as articles.
        if not list_item.find('p'):
            continue
        link = list_item.find('a')
        href = link.get('href') if link is not None else None
        article_id = extract_article_id(href) if href else None
        if article_id:
            article_ids.append(article_id)
    return article_ids


def parse_basic_fields(result_item, podaci):
    """Fill price, location, condition and brand of *podaci* from *result_item*."""
    # Cijena: read from the second <p> inside the 'pc' box.
    price_box = result_item.find('div', {'id': 'pc'})
    if price_box is not None:
        paragraphs = price_box.find_all('p')
        if len(paragraphs) > 1:
            podaci['Cijena'] = parse_price(paragraphs[1].get_text())
    # Lokacija: kanton is the first word, grad is the rest.
    location_box = result_item.find('div', {'class': 'mobile-lokacija'})
    if location_box is not None:
        kanton, grad = split_location(location_box.get('data-content', ''))
        podaci['Lokacija_kanton'] = kanton
        podaci['Lokacija_grad'] = grad
    # Stanje: the second word of the box text is stored.
    condition_box = result_item.find('div', {'class': 'mobile-stanje'})
    if condition_box is not None:
        words = condition_box.get_text().split()
        if len(words) > 1:
            podaci['Stanje'] = words[1]
    # Brand
    brand_box = result_item.find('div', {'itemprop': 'brand'})
    brand_link = brand_box.find('a') if brand_box is not None else None
    if brand_link is not None:
        podaci['Brend'] = brand_link.get_text()


def parse_extra_fields(result_item, podaci):
    """Copy every label/value row of the 'dodatnapolja1' boxes into *podaci*."""
    for extra_box in result_item.find_all('div', {'id': 'dodatnapolja1'}):
        for field in extra_box.find_all('div', {'class': 'df'}):
            label = field.find('div', {'class': 'df1'})
            value = field.find('div', {'class': 'df2'})
            if label is None or value is None:
                continue
            # A value cell containing an <i> element is stored as True.
            podaci[label.get_text()] = True if value.find('i') else value.get_text()


def parse_article(result_item, article_id):
    """Build one export row (dict) from the article container *result_item*.

    Fields whose markup is missing or shorter than expected stay None
    instead of raising.
    """
    # New dictionary instance for every item
    podaci = filters.copy()
    podaci['Kategorija'] = category_names.get(category_of_vehicle)
    podaci["Olx_id"] = article_id
    parse_basic_fields(result_item, podaci)
    parse_extra_fields(result_item, podaci)
    # Vrijeme i datum: first word is the date, third word is the time.
    entry_date = result_item.find('time', {'class': 'entry-date'})
    if entry_date is not None:
        parts = entry_date.get_text().split()
        if len(parts) > 2:
            podaci["Datum"] = parts[0]
            podaci["Vrijeme"] = parts[2]
    return podaci


def crawl():
    """Crawl the configured result pages and return a list with one dict per article."""
    podaci_db = []
    # +1 because range() excludes its end and pages are numbered from 1.
    for page_number in range(1, pages_number_to_crawl + 1):
        soup_page = fetch_soup(main_website_base + str(page_number))
        if soup_page is None:
            continue
        for article_id in collect_article_ids(soup_page):
            artikal_link = article_base_url + article_id
            soup_item = fetch_soup(artikal_link)
            if soup_item is None:
                continue
            result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
            if result_item is None:
                # Page without the article container: nothing to parse.
                print('No article content found at ' + artikal_link)
                continue
            podaci_db.append(parse_article(result_item, article_id))
            # Progress separator printed once per parsed article.
            print('.....................................................')
    return podaci_db


def main():
    """Run the crawl and write the collected rows to the Excel output file."""
    # ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------
    olx_db = pd.DataFrame(crawl())  # built from a list of dicts
    olx_db.to_excel(output_file, index=False)
    print("Zavrseno!!")


if __name__ == "__main__":
    main()