# Listing metadata: 224 lines, 7.0 KiB, Python
# ----------Imports------------
import logging
import random
from datetime import date
from traceback import print_tb
from unittest import result
from urllib import response
from urllib.request import Request

import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib import dates
from numpy import diag_indices

# Pool of browser User-Agent strings; one is picked at random per run so the
# crawler does not always identify itself with the same signature.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
]

# The same User-Agent is reused for every request made during this run.
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}

# Rows collected from every scraped listing (one dict per article); the export
# step at the end of the script turns this list into a DataFrame.
podaci_db = []

logger = logging.getLogger(__name__)

# Seconds to wait for a single HTTP request before giving up on it.
request_timeout = 30

# Number of result pages to crawl; result pages are numbered from 1.
pages_number_to_crawl = 2

# Search category sent in the URL. 20 is labelled 'Teretna vozila' below; the
# original note in this script says kategorija=18 is 'Automobili'.
category_of_vehicle = 20

search_url_template = (
    'https://www.olx.ba/pretraga?kategorija={category}&id=1&stanje=0'
    '&vrstapregleda=tabela&sort_order=desc&sort_po=datum&stranica={page}'
)

# Template row: every known column starts as None so that all rows share the
# same base set of keys. Extra fields found on a listing page are added on top.
filters = {
    "Olx_id": None,
    "Kategorija": None,
    "Cijena": None,
    "Stanje": None,
    "Lokacija_kanton": None,
    "Lokacija_grad": None,
    "Brend": None,
    "Godište": None,
    "Kilometraža": None,
    "Tip": None,
    "Broj osovina": None,
    "Gorivo": None,
    "Konjskih snaga": None,
    "Kilovata (KW)": None,
    "Masa/Težina (kg)": None,
    "Ukupna dozvoljena masa (t)": None,
    "Dužina tovarnog prostora": None,
    "Širina tovarnog prostora": None,
    "Visina tovarnog prostora": None,
    "Emisioni standard": None,
    "Vrsta pogona": None,
    "Transmisija": None,
    "Nosivost (tona)": None,
    "Boja": None,
    "Muzika / ozvučenje": None,
    "Registrovan do": None,
    "Model": None,
    "Strane tablice": None,
    "Sa kranom": None,
    "Metalik": None,
    "Udaren": None,
    "Registrovan/Ocarinjen": None,
    "Servisna knjiga": None,
    "Servo volan": None,
    "El. podizači stakala": None,
    "Električni retrovizori": None,
    "Klima": None,
    "Navigacija": None,
    "Koža": None,
    "Xenon svjetla": None,
    "Alarm": None,
    "Daljinsko otključavanje": None,
    "Centralna brava": None,
    "Dupla kabina": None,
    "Datum": None,
    "Vrijeme": None,
}


def _fetch_soup(url):
    """Download *url* and return it parsed, or None when the request fails.

    Failures (network errors, timeouts, non-2xx/3xx responses) are logged as
    warnings instead of raised, so one bad page does not discard the rows
    already collected in ``podaci_db``.
    """
    try:
        page = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as error:
        logger.warning("Request to %s failed: %s", url, error)
        return None
    if not page.ok:
        logger.warning("Request to %s returned HTTP %s", url, page.status_code)
        return None
    return BeautifulSoup(page.content, 'html.parser')


# Walk the paginated search results. The upper bound is inclusive so that
# exactly `pages_number_to_crawl` pages are visited (range(1, n) stopped one
# page short).
for page_number in range(1, pages_number_to_crawl + 1):
    main_website = search_url_template.format(
        category=category_of_vehicle, page=page_number
    )
    soup_page = _fetch_soup(main_website)
    if soup_page is None:
        continue

    results_all_items_per_page = soup_page.find_all('div', {'class': 'listitem'})

    # Collect the article ids on this page. Only list items that contain a
    # <p> element are considered, as in the original filter.
    olx_id = []
    for list_item in results_all_items_per_page:
        if list_item.find('p') is None:
            continue
        link = list_item.find('a')
        if link is None or not link.get('href'):
            continue
        # The id is read from the fifth '/'-separated piece of the link,
        # e.g. https://www.olx.ba/artikal/<id>/...
        href_parts = link['href'].split('/')
        if len(href_parts) > 4:
            olx_id.append(href_parts[4])
    broj_el = len(olx_id)

    for artikal_number in olx_id:
        # Fresh row for every listing, pre-filled with None for all columns.
        podaci = filters.copy()
        if category_of_vehicle == 20:
            podaci['Kategorija'] = 'Teretna vozila'
        podaci["Olx_id"] = artikal_number

        artikal_link = 'https://www.olx.ba/artikal/' + artikal_number
        soup_item = _fetch_soup(artikal_link)
        if soup_item is None:
            continue
        result_item = soup_item.find('div', {'class': 'artikal_lijevo'})
        if result_item is None:
            logger.warning("No 'artikal_lijevo' container on %s; skipped", artikal_link)
            continue

        # ---- Basic fields ----

        # Price: first word of the second <p> in the price box; a leading
        # 'Po' stands for the phrase "Po dogovoru".
        price_box = result_item.find('div', {'id': 'pc'})
        if price_box is not None:
            price_paragraphs = price_box.find_all('p')
            if len(price_paragraphs) > 1:
                price_tokens = price_paragraphs[1].get_text().split()
                if price_tokens:
                    item_cijena = price_tokens[0]
                    if item_cijena == 'Po':
                        item_cijena = "Po dogovoru"
                    podaci['Cijena'] = item_cijena

        # Location: first word (comma stripped) is stored as the canton, the
        # remaining words as the city.
        location_box = result_item.find('div', {'class': 'mobile-lokacija'})
        if location_box is not None:
            x_lokacija = location_box.get('data-content', '').split()
            if x_lokacija:
                podaci['Lokacija_kanton'] = x_lokacija[0].replace(',', '')
                podaci['Lokacija_grad'] = ' '.join(x_lokacija[1:])

        # Condition: second word of the text in the condition box.
        condition_box = result_item.find('div', {'class': 'mobile-stanje'})
        if condition_box is not None:
            x_stanje = condition_box.get_text().split()
            if len(x_stanje) > 1:
                podaci['Stanje'] = x_stanje[1]

        # Brand: text of the link inside the itemprop="brand" element.
        brand_box = result_item.find('div', {'itemprop': 'brand'})
        if brand_box is not None:
            brand_link = brand_box.find('a')
            if brand_link is not None:
                podaci['Brend'] = brand_link.get_text()

        # ---- Advanced fields ----

        # Each 'df' row holds a name cell ('df1') and a value cell ('df2').
        # A value cell containing an <i> element is stored as True; otherwise
        # its text is stored. Names not in the template become new columns.
        for extra_fields_div in result_item.find_all('div', {'id': 'dodatnapolja1'}):
            for field_row in extra_fields_div.find_all('div', {'class': 'df'}):
                name_cell = field_row.find('div', {'class': 'df1'})
                value_cell = field_row.find('div', {'class': 'df2'})
                if name_cell is None or value_cell is None:
                    continue
                if value_cell.find('i') is not None:
                    podaci[name_cell.get_text()] = True
                else:
                    podaci[name_cell.get_text()] = value_cell.get_text()

        # Date and time: first and third whitespace-separated tokens of the
        # <time class="entry-date"> text.
        time_tag = result_item.find('time', {'class': 'entry-date'})
        if time_tag is not None:
            date_time_div = time_tag.get_text().split()
            if date_time_div:
                podaci["Datum"] = date_time_div[0]
            if len(date_time_div) > 2:
                podaci["Vrijeme"] = date_time_div[2]

        # `podaci` is already a private copy, so it can be stored directly.
        podaci_db.append(podaci)


# ------------- CREATE PANDAS DATAFRAME - DICTIONARY --------------

# podaci_db must be a list of row dicts; the DataFrame gets one row per
# listing and one column per distinct key.
olx_db = pd.DataFrame(podaci_db)

# Write the result next to the script, without the DataFrame index column.
olx_db.to_excel('test_teretna2.xlsx', index=False)

print("Zavrseno!!")