Lowercase the translated text before sentence tokenizing and escape the hyphen in the cleanup regex character class
This commit is contained in:
7
hamo.py
7
hamo.py
@@ -6,6 +6,9 @@ import re
|
|||||||
|
|
||||||
# download url
|
# download url
|
||||||
url = 'https://www.olx.ba/artikal/33001845/stan-u-centru-ilidza-sa-balkona/'
|
url = 'https://www.olx.ba/artikal/33001845/stan-u-centru-ilidza-sa-balkona/'
|
||||||
|
url = 'https://prostor.ba/prodaja/stan/dobrinja/9709'
|
||||||
|
|
||||||
|
|
||||||
res = requests.get(url)
|
res = requests.get(url)
|
||||||
html_page = res.content
|
html_page = res.content
|
||||||
|
|
||||||
@@ -39,8 +42,8 @@ print("======================================")
|
|||||||
# translate to english
|
# translate to english
|
||||||
translator = Translator()
|
translator = Translator()
|
||||||
en_output = translator.translate(internationalized).text
|
en_output = translator.translate(internationalized).text
|
||||||
sentences = sent_tokenize(en_output)
|
sentences = sent_tokenize(en_output.lower())
|
||||||
no_whitespace = list(map(lambda s: re.sub(r'[^A-Za-z0-9\.,-_]+', ' ', s),sentences))
|
no_whitespace = list(map(lambda s: re.sub(r'[^A-Za-z0-9\.,\-_]+', ' ', s),sentences))
|
||||||
|
|
||||||
only_with_numbers = [sentence for sentence in no_whitespace if bool(re.search(r'\d', sentence))]
|
only_with_numbers = [sentence for sentence in no_whitespace if bool(re.search(r'\d', sentence))]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user