full filter
This commit is contained in:
61700
crawler/bardate_domains.txt
Normal file
61700
crawler/bardate_domains.txt
Normal file
File diff suppressed because it is too large
Load Diff
46129
crawler/bardate_domains_nu.txt
Normal file
46129
crawler/bardate_domains_nu.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,12 +1,18 @@
|
|||||||
var config = {};
|
var config = {};
|
||||||
|
|
||||||
|
/*
|
||||||
config.domainList = [
|
config.domainList = [
|
||||||
'https://www.iis.se/data/bardate_domains_nu.txt',
|
'https://www.iis.se/data/bardate_domains_nu.txt',
|
||||||
'https://www.iis.se/data/bardate_domains.txt'];
|
'https://www.iis.se/data/bardate_domains.txt'];
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
config.domainList = ['/home/bilal/Saburly/domene-svedska/crawler/bardate_domains.txt'];
|
||||||
|
|
||||||
config.wordList = __dirname + '/words.txt';
|
config.wordList = __dirname + '/words.txt';
|
||||||
|
|
||||||
config.lettersOnlyRegex = /^[A-Za-z]+$/;
|
config.lettersOnlyRegex = /^[A-Za-z]+$/;
|
||||||
|
config.swedishLettersOnly = /^[A-Za-zÅåÄäÖöüÜáÁèÈàÀéÉëËíÍÆæøØçÇ]+$/;
|
||||||
|
|
||||||
config.databaseURL = 'mongodb://localhost:27017/kivi';
|
config.databaseURL = 'mongodb://localhost:27017/kivi';
|
||||||
|
|
||||||
|
|||||||
@@ -15,17 +15,22 @@ MongoClient.connect(config.databaseURL).then(database => {
|
|||||||
config.words=[];
|
config.words=[];
|
||||||
}else{
|
}else{
|
||||||
config.words = data.split('\n');
|
config.words = data.split('\n');
|
||||||
|
let tmpWords = config.words.map((word,index)=>{
|
||||||
|
return word.toLowerCase();
|
||||||
|
});
|
||||||
|
config.words = tmpWords;
|
||||||
|
|
||||||
|
config.domainList.map(url=>{
|
||||||
|
//get domain list from url
|
||||||
|
links.getDomainList(url, (res)=>{
|
||||||
|
res.map(obj =>{
|
||||||
|
fs.appendFileSync('izlaz.txt',obj.domainName+'\n',err=>{console.log('er:' + err)})
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
config.domainList.map(url=>{
|
|
||||||
//get domain list from url
|
|
||||||
links.getDomainList(url, (res)=>{
|
|
||||||
res.map(obj =>{fs.appendFile('izlaz.txt',obj.domainName+'\n',err=>{console.log('er:' + err)})});
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
|
|
||||||
}).catch(reason=>{
|
}).catch(reason=>{
|
||||||
console.log("Error : " + reason);
|
console.log("Error : " + reason);
|
||||||
});
|
});
|
||||||
@@ -1,15 +1,19 @@
|
|||||||
const config = require('../config');
|
const config = require('../config');
|
||||||
const https = require("https");
|
const https = require("https");
|
||||||
|
const punycode = require('punycode');
|
||||||
|
var fs = require('fs');
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
getDomainList : function(url, callback){
|
getDomainList : function(url, callback){
|
||||||
getRawDomainList(url,(raw)=>{
|
getRawDomainList(url,(raw)=>{
|
||||||
|
|
||||||
let result = [];
|
let result = [];
|
||||||
raw.split('\n').map(domain=>{
|
raw.split('\n').map(domain=>{
|
||||||
let dot = domain.indexOf('.');
|
let unicodeDomain = punycode.toUnicode(domain);
|
||||||
|
let dot = unicodeDomain.indexOf('.');
|
||||||
if (dot !== -1){
|
if (dot !== -1){
|
||||||
let domainName = domain.substring(0,dot);
|
let domainName = unicodeDomain.substring(0,dot);
|
||||||
if (domainName.match(config.lettersOnlyRegex)){
|
if (domainName.match(config.swedishLettersOnly)){
|
||||||
//domain name contains only letters
|
//domain name contains only letters
|
||||||
//line in domain list is formatted as follows : [domain name]\t[expiration date]
|
//line in domain list is formatted as follows : [domain name]\t[expiration date]
|
||||||
result.push({domainName: domainName, expirationDate: domain.split('\t')[1]});
|
result.push({domainName: domainName, expirationDate: domain.split('\t')[1]});
|
||||||
@@ -17,6 +21,7 @@ module.exports = {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
applyFilter(result, callback);
|
applyFilter(result, callback);
|
||||||
|
console.log("Result Len : " + result.length);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -34,14 +39,25 @@ var applyFilter = function (domains, callback){
|
|||||||
}
|
}
|
||||||
|
|
||||||
var getRawDomainList = function (url, callback) {
|
var getRawDomainList = function (url, callback) {
|
||||||
https.get(url, res => {
|
if (url[0]==='/'){
|
||||||
res.setEncoding("utf8");
|
//it's local file
|
||||||
let body = "";
|
fs.readFile(url,'utf8',(err,data)=>{
|
||||||
res.on("data", data => {
|
if (err){
|
||||||
body += data;
|
console.log("err : " + err);
|
||||||
|
}else{
|
||||||
|
callback(data);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
res.on("end", () => {
|
}else{
|
||||||
callback(body);
|
https.get(url, res => {
|
||||||
|
res.setEncoding("utf8");
|
||||||
|
let body = "";
|
||||||
|
res.on("data", data => {
|
||||||
|
body += data;
|
||||||
|
});
|
||||||
|
res.on("end", () => {
|
||||||
|
callback(body);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
}
|
||||||
}
|
}
|
||||||
2041
crawler/izlaz.txt
2041
crawler/izlaz.txt
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,6 @@
|
|||||||
{
|
{
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"mongodb": "^2.2.33"
|
"mongodb": "^2.2.33",
|
||||||
|
"punycode": "^2.1.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
152890
crawler/words.txt
152890
crawler/words.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user