"""Scrape a classified-ad page, translate it to English and print numeric features.

Pipeline: download the page, keep only the visible text nodes, replace the
local currency marker with a neutral token, translate the text, split it into
sentences and, for every word containing a digit, print that word together
with its immediate neighbours.
"""
import re

# Third-party packages (requests, bs4, googletrans, nltk) are imported inside
# the functions that need them, so the pure text helpers below can be imported
# and exercised without network libraries being installed.

DEFAULT_URL = 'https://www.olx.ba/artikal/33001845/stan-u-centru-ilidza-sa-balkona/'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block forever.
REQUEST_TIMEOUT_SECONDS = 30

# Text nodes whose parent tag is one of these are not visible page content.
BLACKLIST = frozenset({
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'style',
    'head',
    'input',
    'script',
})

# Runs of characters other than letters, digits, '.', ',', '_' and '-'.
# The hyphen is placed last so it is a literal: the previous spelling
# ``,-_`` was a *range* from ',' to '_' and silently kept characters such as
# ':', ';', '/', '?', '@' and '['.
_DISALLOWED_RUN = re.compile(r'[^A-Za-z0-9.,_-]+')
_DIGIT = re.compile(r'\d')


def download_page(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Return the raw body (bytes) of *url*.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never scraped as if it were the listing.
    """
    import requests

    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return res.content


def extract_visible_text(html_page):
    """Return the page's text nodes, one per line, skipping blacklisted parents."""
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_page, 'html.parser')
    return ''.join(
        '{}\n'.format(node)
        for node in soup.find_all(string=True)
        if node.parent.name not in BLACKLIST
    )


def mark_local_traits(text):
    """Replace every occurrence of the local currency code 'KM' with '_currency_'.

    Plain substring replacement is deliberate: prices may be written without a
    space before the code (e.g. '150.000KM').
    """
    return text.replace('KM', '_currency_')


def translate_to_english(text):
    """Translate *text* with googletrans and return the translated string.

    No destination language is passed, so this relies on the library default
    (English, per the original script's comment).
    """
    from googletrans import Translator

    return Translator().translate(text).text


def clean_sentence(sentence):
    """Collapse each run of disallowed characters in *sentence* into one space."""
    return _DISALLOWED_RUN.sub(' ', sentence)


def extract_features(sentences):
    """Return ``[previous, word, next]`` windows around every digit-bearing word.

    Each sentence is cleaned with :func:`clean_sentence`; sentences without any
    digit are skipped. The remaining ones are split on single spaces and, for
    each word containing a digit, the word and its existing neighbours are
    collected. Empty strings (produced by leading/trailing spaces) are dropped,
    so a window holds between one and three words.
    """
    features = []
    for sentence in sentences:
        cleaned = clean_sentence(sentence)
        if not _DIGIT.search(cleaned):
            continue
        words = cleaned.split(' ')
        for idx, word in enumerate(words):
            if not _DIGIT.search(word):
                continue
            # max() keeps the slice from wrapping around at the first word.
            window = words[max(idx - 1, 0):idx + 2]
            features.append([w for w in window if w])
    return features


def main(url=DEFAULT_URL):
    """Run the full scrape/translate/extract pipeline for *url* and print results."""
    from nltk import sent_tokenize

    # download url
    html_page = download_page(url)

    # remove html and mark local traits (currency etc)
    internationalized = mark_local_traits(extract_visible_text(html_page))
    print("======================================")

    # translate to english
    en_output = translate_to_english(internationalized)

    for feature in extract_features(sent_tokenize(en_output)):
        print(feature)


if __name__ == '__main__':
    main()