From 0b7fbc6b72f4fcbf8b0d6ffd0081507c083fae97 Mon Sep 17 00:00:00 2001 From: GotPPay Date: Tue, 12 Dec 2017 22:08:02 +0100 Subject: [PATCH] code fix --- crawler/bardate_domains_10_12.txt | 2 +- crawler/bardate_domains_11_12.txt | 1 - crawler/crawler.js | 72 +++++---------- crawler/helper/database.js | 31 +++++++ crawler/helper/links.js | 147 ++++++++++++++++++++---------- 5 files changed, 154 insertions(+), 99 deletions(-) create mode 100644 crawler/helper/database.js diff --git a/crawler/bardate_domains_10_12.txt b/crawler/bardate_domains_10_12.txt index 4b2417a..b358c0f 100644 --- a/crawler/bardate_domains_10_12.txt +++ b/crawler/bardate_domains_10_12.txt @@ -1248,7 +1248,7 @@ agag.se 2018-01-15 agagil.se 2018-02-13 agal.se 2018-01-15 agaler.se 2018-02-13 -agan.se 2018-01-15 +agan.se 2017-12-12 agapolitikern.se 2018-01-08 agarum.se 2017-12-30 agathared.se 2017-12-29 diff --git a/crawler/bardate_domains_11_12.txt b/crawler/bardate_domains_11_12.txt index 65a486c..e988e77 100644 --- a/crawler/bardate_domains_11_12.txt +++ b/crawler/bardate_domains_11_12.txt @@ -1252,7 +1252,6 @@ agag.se 2018-01-15 agagil.se 2018-02-13 agal.se 2018-01-15 agaler.se 2018-02-13 -agan.se 2018-01-15 agapolitikern.se 2018-01-08 agarum.se 2017-12-30 agathared.se 2017-12-29 diff --git a/crawler/crawler.js b/crawler/crawler.js index b8ea711..6de7c36 100644 --- a/crawler/crawler.js +++ b/crawler/crawler.js @@ -1,14 +1,15 @@ var config = require('./config'); const links = require('./helper/links'); +const database = require('./helper/database'); var MongoClient = require ('mongodb').MongoClient; var ObjectID = require ('mongodb').ObjectID; var fs = require('fs'); var datetime = require('node-datetime'); var http = require('http'); -const punycode = require('punycode'); -MongoClient.connect(config.databaseURL).then(database => { - let db = database; + +MongoClient.connect(config.databaseURL).then(mongoDatabase => { + let db = mongoDatabase; db.executeDbAdminCommand( { setParameter: true, textSearchEnabled : true}); db.collection('expired_list').createIndex({domainName: 'text'}, {unique: true}); @@ -30,55 +31,30 @@ MongoClient.connect(config.databaseURL).then(database => { config.domainList.map(url=>{ - //get domain list from url - links.getDomainList(url, (res)=>{ - db.collection('today').insert(res,()=>{ - //insertion done, compare domains with yesterday - db.collection('yesterday').find({}).toArray((err,result)=>{ - if (err){ - console.log("Error : " + err); - }else{ - result.map((domain)=>{ - db.collection('today').findOne({domainName:domain.domainName}, (err,result)=>{ - if (result===null){ - if (datetime.create().format('Y-m-d')===domain.expirationDate){ - db.collection('expired_list').insert(domain); - } - } - }); + links.getDomainList(url).then(res=>{ + database.insertTodayDomains(db,res).then(()=>{ + database.getYesterdayDomains(db).then((result)=>{ + + result.map((domain)=>{ + db.collection('today').findOne({domainName:domain.domainName}, (err,result)=>{ + if ((result===null) &&(datetime.create().format('Y-m-d')===domain.expirationDate)){ + db.collection('expired_list').insert(domain); + } }); - db.collection('expired_list').find({}).toArray((err,result)=>{ - result.map(domain=>{ - let checkLink = ''; - switch(domain.tld){ - case 'se': - checkLink = config.seDomainCheck; - break; - case 'nu': - checkLink = config.nuDomainCheck; - break; - } - let fullName = domain.domainName + '.' + domain.tld; - http.get(checkLink+punycode.toASCII(fullName), res => { - res.setEncoding("utf8"); - let body = ""; - res.on("data", data => { - body += data; - }); - res.on("end", () => { - let status = body.split(' ')[0]; - if (status !== 'free'){ - db.collection('expired_list').remove({domainName:domain.domainName}); - } - }); - }); - }); - process.exit(0); + }); + + database.getExpiredDomains(db).then((result)=>{ + links.checkExpiredDomains(db,result).then(()=>{ + console.log("Done"); + db.close(); }); - } + }); + + }).catch((e)=>{ + console.log(e); }); }); - }); + }); }); } }); diff --git a/crawler/helper/database.js b/crawler/helper/database.js new file mode 100644 index 0000000..88be460 --- /dev/null +++ b/crawler/helper/database.js @@ -0,0 +1,31 @@ +module.exports = { + insertTodayDomains : function (db, domains){ + return new Promise((resolve,reject)=>{ + db.collection('today').insert(domains, ()=>{ + resolve(); + }); + }); + }, + + getYesterdayDomains : function (db){ + return getArrayFromDatabase(db, 'yesterday'); + }, + + getExpiredDomains : function (db){ + return getArrayFromDatabase(db, 'expired_list'); + } + + +}; + +const getArrayFromDatabase = function(db, collection){ + return new Promise((resolve,reject)=>{ + db.collection(collection).find({}).toArray((err,result)=>{ + if (err){ + reject('Error reading collection'); + }else{ + resolve(result); + } + }); + }); +} \ No newline at end of file diff --git a/crawler/helper/links.js b/crawler/helper/links.js index 5ec10de..fd933de 100644 --- a/crawler/helper/links.js +++ b/crawler/helper/links.js @@ -4,61 +4,110 @@ const punycode = require('punycode'); var fs = require('fs'); module.exports = { - getDomainList : function(url, callback){ - - getRawDomainList(url,(raw)=>{ - let result = []; - raw.split('\n').map(domain=>{ - let unicodeDomain = punycode.toUnicode(domain); - let dot = unicodeDomain.indexOf('.'); - let tab = unicodeDomain.indexOf('\t'); - if (dot !== -1){ - let domainName = unicodeDomain.substring(0,dot); - let tld = unicodeDomain.substring(dot+1,tab); - if (domainName.match(config.swedishLettersOnly)){ - //domain name contains only letters - //line in domain list is formatted as follows : [domain name]\t[expiration date] - result.push({domainName: domainName, tld:tld ,expirationDate: domain.split('\t')[1]}); - } - } + getDomainList : function(url){ + return new Promise((resolve, reject)=>{ + getRawDomainList(url).then(raw=>{ + processDomains(raw).then(result=>{ + applyFilter(result).then(result=>{ + resolve(result); + }) + }); }); - applyFilter(result, callback); + }); + }, + + checkExpiredDomains : function(db, domains){ + return new Promise((resolve,reject)=>{ + domains.map(domain=>{ + let checkLink = ''; + switch(domain.tld){ + case 'se': + checkLink = config.seDomainCheck; + break; + case 'nu': + checkLink = config.nuDomainCheck; + break; + } + + let fullName = domain.domainName + '.' + domain.tld; + http.get(checkLink+punycode.toASCII(fullName), res => { + res.setEncoding("utf8"); + let body = ""; + res.on("data", data => { + body += data; + }); + res.on("end", () => { + let status = body.split(' ')[0]; + if (status !== 'free'){ + db.collection('expired_list').remove({domainName:domain.domainName}); + } + }); + }); + }); + resolve(); }); } + + }; -var applyFilter = function (domains, callback){ - //get domain names that only match whole words - let result = []; - domains.map(domain=>{ - let index = config.words.indexOf(domain.domainName); - if (index !== -1){ - result.push(domain); - } - }); - callback(result); -} - -var getRawDomainList = function (url, callback) { - if (url[0]==='/'){ - //it's local file - fs.readFile(url,'utf8',(err,data)=>{ - if (err){ - console.log("err : " + err); - }else{ - callback(data); +var applyFilter = function (domains){ + return new Promise((resolve,reject)=>{ + //get domain names that only match whole words + let result = []; + domains.map(domain=>{ + let index = config.words.indexOf(domain.domainName); + if (index !== -1){ + result.push(domain); } }); - }else{ - https.get(url, res => { - res.setEncoding("utf8"); - let body = ""; - res.on("data", data => { - body += data; - }); - res.on("end", () => { - callback(body); - }); + resolve(result); + }); +} + +var processDomains = function(raw){ + return new Promise((resolve,reject)=>{ + let result = []; + raw.split('\n').map(domain=>{ + let unicodeDomain = punycode.toUnicode(domain); + let dot = unicodeDomain.indexOf('.'); + let tab = unicodeDomain.indexOf('\t'); + if (dot !== -1){ + let domainName = unicodeDomain.substring(0,dot); + let tld = unicodeDomain.substring(dot+1,tab); + if (domainName.match(config.swedishLettersOnly)){ + //domain name contains only letters + //line in domain list is formatted as follows : [domain name]\t[expiration date] + result.push({domainName: domainName, tld:tld ,expirationDate: domain.split('\t')[1]}); + } + } }); - } + resolve(result); + }); +} + +var getRawDomainList = function (url) { + return new Promise((resolve, reject)=>{ + if (url[0]==='/'){ + //it's local file + fs.readFile(url,'utf8',(err,data)=>{ + if (err){ + reject(err); + }else{ + resolve(data); + } + }); + }else{ + https.get(url, res => { + res.setEncoding("utf8"); + let body = ""; + res.on("data", data => { + body += data; + }); + res.on("end", () => { + resolve(body); + }); + }); + } + }); } \ No newline at end of file