full filter
This commit is contained in:
61700
crawler/bardate_domains.txt
Normal file
61700
crawler/bardate_domains.txt
Normal file
File diff suppressed because it is too large
Load Diff
46129
crawler/bardate_domains_nu.txt
Normal file
46129
crawler/bardate_domains_nu.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,12 +1,18 @@
|
||||
// Crawler configuration: every setting is attached as a property of this object.
var config = {};

/*
 * Remote sources of the domain lists. Disabled: the crawler currently reads the
 * local copy configured below instead of downloading these.
config.domainList = [
    'https://www.iis.se/data/bardate_domains_nu.txt',
    'https://www.iis.se/data/bardate_domains.txt'];
*/

// Local copy of the domain list. Resolved relative to this file (the same way
// wordList is) instead of a hard-coded path into one developer's home directory,
// so the crawler runs from any checkout location.
// NOTE(review): assumes bardate_domains.txt sits next to this file, as words.txt does — confirm.
config.domainList = [__dirname + '/bardate_domains.txt'];

// Word list file, located next to this configuration file.
config.wordList = __dirname + '/words.txt';

// Matches a non-empty string made up of ASCII letters only.
config.lettersOnlyRegex = /^[A-Za-z]+$/;
// Matches a non-empty string made up of ASCII letters plus the accented letters listed in the class.
config.swedishLettersOnly = /^[A-Za-zÅåÄäÖöüÜáÁèÈàÀéÉëËíÍÆæøØçÇ]+$/;

// MongoDB connection string.
config.databaseURL = 'mongodb://localhost:27017/kivi';
|
||||
|
||||
|
||||
@@ -14,17 +14,22 @@ MongoClient.connect(config.databaseURL).then(database => {
|
||||
console.log(err);
|
||||
config.words=[];
|
||||
}else{
|
||||
config.words = data.split('\n');
|
||||
config.words = data.split('\n');
|
||||
let tmpWords = config.words.map((word,index)=>{
|
||||
return word.toLowerCase();
|
||||
});
|
||||
config.words = tmpWords;
|
||||
|
||||
config.domainList.map(url=>{
|
||||
//get domain list from url
|
||||
links.getDomainList(url, (res)=>{
|
||||
res.map(obj =>{
|
||||
fs.appendFileSync('izlaz.txt',obj.domainName+'\n',err=>{console.log('er:' + err)})
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
config.domainList.map(url=>{
|
||||
//get domain list from url
|
||||
links.getDomainList(url, (res)=>{
|
||||
res.map(obj =>{fs.appendFile('izlaz.txt',obj.domainName+'\n',err=>{console.log('er:' + err)})});
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
}).catch(reason=>{
|
||||
console.log("Error : " + reason);
|
||||
|
||||
@@ -1,15 +1,19 @@
|
||||
const config = require('../config');
|
||||
const https = require("https");
|
||||
const punycode = require('punycode');
|
||||
var fs = require('fs');
|
||||
|
||||
module.exports = {
|
||||
getDomainList : function(url, callback){
|
||||
getRawDomainList(url,(raw)=>{
|
||||
|
||||
let result = [];
|
||||
raw.split('\n').map(domain=>{
|
||||
let dot = domain.indexOf('.');
|
||||
let unicodeDomain = punycode.toUnicode(domain);
|
||||
let dot = unicodeDomain.indexOf('.');
|
||||
if (dot !== -1){
|
||||
let domainName = domain.substring(0,dot);
|
||||
if (domainName.match(config.lettersOnlyRegex)){
|
||||
let domainName = unicodeDomain.substring(0,dot);
|
||||
if (domainName.match(config.swedishLettersOnly)){
|
||||
//domain name contains only letters
|
||||
//line in domain list is formatted as follows : [domain name]\t[expiration date]
|
||||
result.push({domainName: domainName, expirationDate: domain.split('\t')[1]});
|
||||
@@ -17,6 +21,7 @@ module.exports = {
|
||||
}
|
||||
});
|
||||
applyFilter(result, callback);
|
||||
console.log("Result Len : " + result.length);
|
||||
});
|
||||
}
|
||||
};
|
||||
@@ -34,14 +39,25 @@ var applyFilter = function (domains, callback){
|
||||
}
|
||||
|
||||
/**
 * Loads the raw text of a domain list and passes it to `callback`.
 *
 * A `url` whose first character is '/' is treated as a local file and read as
 * UTF-8; any other value is fetched with `https.get` and the response body is
 * accumulated as UTF-8 text.
 *
 * On failure (unreadable file or failed request) the error is logged and
 * `callback` is not invoked.
 *
 * @param {string} url - Absolute local path or https URL of the list.
 * @param {function(string): void} callback - Receives the full text of the list.
 */
var getRawDomainList = function (url, callback) {
    if (url[0] === '/') {
        // it's local file
        fs.readFile(url, 'utf8', (err, data) => {
            if (err) {
                console.log("err : " + err);
                return;
            }
            callback(data);
        });
        return;
    }

    https.get(url, res => {
        res.setEncoding("utf8");
        let body = "";
        res.on("data", chunk => {
            body += chunk;
        });
        res.on("end", () => {
            callback(body);
        });
    }).on("error", err => {
        // Without this listener a refused connection or DNS failure emits an
        // unhandled 'error' event, which terminates the whole process.
        console.log("err : " + err);
    });
};
|
||||
2041
crawler/izlaz.txt
2041
crawler/izlaz.txt
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,6 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"mongodb": "^2.2.33"
|
||||
"mongodb": "^2.2.33",
|
||||
"punycode": "^2.1.0"
|
||||
}
|
||||
}
|
||||
|
||||
152890
crawler/words.txt
152890
crawler/words.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user