Files
old-domene-svedska/crawler/helper/links.js

63 lines
1.9 KiB
JavaScript
Raw Normal View History

2017-12-07 12:36:41 +01:00
const config = require('../config');
const https = require("https");
2017-12-08 10:57:47 +01:00
const punycode = require('punycode');
var fs = require('fs');
2017-12-07 12:36:41 +01:00
module.exports = {
getDomainList : function(url, callback){
getRawDomainList(url,(raw)=>{
2017-12-08 10:57:47 +01:00
2017-12-07 12:36:41 +01:00
let result = [];
raw.split('\n').map(domain=>{
2017-12-08 10:57:47 +01:00
let unicodeDomain = punycode.toUnicode(domain);
let dot = unicodeDomain.indexOf('.');
2017-12-07 12:36:41 +01:00
if (dot !== -1){
2017-12-08 10:57:47 +01:00
let domainName = unicodeDomain.substring(0,dot);
if (domainName.match(config.swedishLettersOnly)){
2017-12-07 12:36:41 +01:00
//domain name contains only letters
//line in domain list is formatted as follows : [domain name]\t[expiration date]
result.push({domainName: domainName, expirationDate: domain.split('\t')[1]});
}
}
});
applyFilter(result, callback);
2017-12-08 10:57:47 +01:00
console.log("Result Len : " + result.length);
2017-12-07 12:36:41 +01:00
});
}
};
/**
 * Selects the domains whose domainName is found in config.words and hands
 * the selection to callback.
 *
 * @param {Array<{domainName: string}>} domains - Candidate entries.
 * @param {function(Array)} callback - Called once with the selected entries,
 *        in their original order.
 */
var applyFilter = function (domains, callback){
    // An entry is kept when config.words.indexOf finds its name.
    const isListedWord = (entry) => config.words.indexOf(entry.domainName) !== -1;
    callback(domains.filter(isListedWord));
}
/**
 * Reads the raw domain list text and passes it to callback.
 *
 * A url starting with '/' is read as a local file (UTF-8); anything else is
 * requested with https.get and the response body is collected as UTF-8 text.
 * On failure the error is logged and callback is NOT invoked.
 *
 * @param {string} url - Absolute file path or https URL of the list.
 * @param {function(string)} callback - Receives the complete list text.
 */
var getRawDomainList = function (url, callback) {
    const reportError = (err) => {
        console.log("err : " + err);
    };

    if (url[0]==='/'){
        // A leading slash marks a local file path.
        fs.readFile(url,'utf8',(err,data)=>{
            if (err){
                reportError(err);
                return;
            }
            callback(data);
        });
        return;
    }

    https.get(url, res => {
        res.setEncoding("utf8");
        let body = "";
        res.on("data", chunk => {
            body += chunk;
        });
        res.on("end", () => {
            callback(body);
        });
        res.on("error", reportError);
    // Without a listener, a failed request emits an unhandled 'error' event
    // and takes the whole process down.
    }).on("error", reportError);
};