diff --git a/hamo.py b/hamo.py index debd2cf..c05cb76 100644 --- a/hamo.py +++ b/hamo.py @@ -6,6 +6,9 @@ import re # download url url = 'https://www.olx.ba/artikal/33001845/stan-u-centru-ilidza-sa-balkona/' +url = 'https://prostor.ba/prodaja/stan/dobrinja/9709' + + res = requests.get(url) html_page = res.content @@ -39,8 +42,8 @@ print("======================================") # translate to english translator = Translator() en_output = translator.translate(internationalized).text -sentences = sent_tokenize(en_output) -no_whitespace = list(map(lambda s: re.sub(r'[^A-Za-z0-9\.,-_]+', ' ', s),sentences)) +sentences = sent_tokenize(en_output.lower()) +no_whitespace = list(map(lambda s: re.sub(r'[^A-Za-z0-9\.,\-_]+', ' ', s),sentences)) only_with_numbers = [sentence for sentence in no_whitespace if bool(re.search(r'\d', sentence))]