Files
old-nlp/hamo.py
2019-10-20 13:18:54 +02:00

63 lines
1.5 KiB
Python

import requests
from bs4 import BeautifulSoup
from googletrans import Translator
from nltk import sent_tokenize
import re
# Download the raw HTML of the listing page.
url = 'https://www.olx.ba/artikal/33001845/stan-u-centru-ilidza-sa-balkona/'
# requests waits indefinitely unless a timeout is given, so an unresponsive
# server would hang the whole script; bound the wait (seconds) instead.
res = requests.get(url, timeout=30)
# Keep the undecoded bytes; the HTML parser downstream receives them as-is.
html_page = res.content
# Strip the markup: parse the page and keep only its human-visible text.
soup = BeautifulSoup(html_page, 'html.parser')
# Every text node in the document, regardless of which tag contains it.
text = soup.find_all(text=True)
# Text nodes whose direct parent is one of these tags are dropped.
blacklist = [
    '[document]', 'noscript', 'header',
    'html', 'meta', 'style',
    'head', 'input', 'script',
]
# One line per kept text node, each terminated by a newline.
kept_nodes = [node for node in text if node.parent.name not in blacklist]
output = ''.join('{}\n'.format(node) for node in kept_nodes)
# Replace locale-specific markers with neutral placeholders before translating.
# Every occurrence of 'KM' (presumably the local currency abbreviation -- see
# the placeholder name) becomes '_currency_'.
internationalized = re.sub(r'KM', '_currency_', output)
print("======================================")
# Translate to English. No target language is passed, so this relies on the
# translator's default -- NOTE(review): confirm that default is English.
translator = Translator()
translation = translator.translate(internationalized)
en_output = translation.text
# Split the translated text into sentences.
sentences = sent_tokenize(en_output)
# Collapse every run of characters other than letters, digits, '.', ',', '_'
# and '-' into a single space. The hyphen sits last in the class so it is a
# literal: written as ',-_' it formed a range from ',' to '_' that also let
# through characters such as '/', ':', ';', '?', '@', '[' and ']'.
no_whitespace = [re.sub(r'[^A-Za-z0-9.,_-]+', ' ', s) for s in sentences]
# Only sentences containing at least one digit can yield a numeric feature.
only_with_numbers = [sentence for sentence in no_whitespace if re.search(r'\d', sentence)]
# Tokenize on single spaces. A sentence that starts or ends with a space
# yields an empty-string token at that edge.
lines = [sentence.split(' ') for sentence in only_with_numbers]
# For every token containing a digit, record it with its immediate left and
# right neighbours (when present and non-empty) as one feature.
features = []
for line in lines:
    last_idx = len(line) - 1
    for idx, word in enumerate(line):
        if not re.search(r'\d+', word):
            continue
        # None stands in for a neighbour that would fall outside the line.
        window = [
            line[idx - 1] if idx > 0 else None,
            word,
            line[idx + 1] if idx < last_idx else None,
        ]
        # Drop missing neighbours and empty-string tokens alike.
        features.append([token for token in window if token])

# Print one feature (a list of up to three tokens) per line.
for feature in features:
    print(feature)