Initial commit
This commit is contained in:
62
hamo.py
Normal file
62
hamo.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from googletrans import Translator
|
||||
from nltk import sent_tokenize
|
||||
import re
|
||||
|
||||
# download url
url = 'https://www.olx.ba/artikal/33001845/stan-u-centru-ilidza-sa-balkona/'
# Bound the wait so a stalled server cannot hang the script indefinitely.
res = requests.get(url, timeout=30)
# Fail fast on a 4xx/5xx response instead of scraping an error page as if
# it were the listing.
res.raise_for_status()
# Raw response body (bytes); BeautifulSoup handles the decoding.
html_page = res.content
|
||||
|
||||
# remove html
# Parse the downloaded page and collect every text node in the document.
soup = BeautifulSoup(html_page, 'html.parser')
text = soup.find_all(text=True)

# Parent tag names whose text nodes are excluded from the output.
blacklist = [
    '[document]', 'noscript', 'header', 'html', 'meta',
    'style', 'head', 'input', 'script',
]

# One text node per line, skipping nodes whose direct parent is blacklisted.
output = ''.join(
    '{}\n'.format(node)
    for node in text
    if node.parent.name not in blacklist
)
|
||||
|
||||
|
||||
# mark local traits (currency etc)
# Replace the standalone currency code "KM" with a placeholder. The
# lookarounds reject a match that touches a letter on either side, so "KM"
# embedded in a longer word is left alone, while forms such as "100KM" or
# "KM/m2" are still marked. ([^\W\d_] matches any single letter.)
internationalized = re.sub(r'(?<![^\W\d_])KM(?![^\W\d_])', '_currency_', output)
|
||||
|
||||
# Visual separator on stdout before the results.
print("======================================")

# translate to english
translator = Translator()
translation = translator.translate(internationalized)
en_output = translation.text

# Break the translated text into individual sentences.
sentences = sent_tokenize(en_output)
|
||||
# Collapse every run of characters other than ASCII letters, digits,
# '.', ',', '_' and '-' into a single space. The hyphen sits last in the
# class so it is a literal: written as ",-_" it forms a range from ','
# to '_' that also lets through '/', ':', ';', '<', '=', '>', '?', '@',
# '[', '\', ']' and '^'.
no_whitespace = [re.sub(r'[^A-Za-z0-9.,_-]+', ' ', s) for s in sentences]

# Keep only the sentences that contain at least one digit.
only_with_numbers = [sentence for sentence in no_whitespace if re.search(r'\d', sentence)]

# Split on single spaces; a sentence that starts or ends with a space
# yields an empty string at that end of its word list.
lines = [sentence.split(' ') for sentence in only_with_numbers]
|
||||
|
||||
# For every word that contains a digit, record that word together with
# its immediate neighbours (previous word, the word itself, next word).
features = []
for words in lines:
    for pos, token in enumerate(words):
        if re.search(r'\d+', token) is None:
            continue
        # The slice is clipped at the list edges, so a missing neighbour
        # is simply absent from the window.
        window = words[max(pos - 1, 0):pos + 2]
        # Empty strings (left over from splitting on ' ') are dropped.
        features.append([w for w in window if w])

# Print one extracted feature per line.
for feature in features:
    print(feature)
|
||||
|
||||
Reference in New Issue
Block a user