This commit is contained in:
GotPPay
2017-12-07 12:36:41 +01:00
parent 229b90495d
commit ab51d436ce
10 changed files with 223757 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
node_modules

View File

@@ -1,2 +1,7 @@
# domene-svedska
Elman domene svedska projekat
```sh
cd crawler
npm install
node crawler.js
```

5
backend/config.js Normal file
View File

@@ -0,0 +1,5 @@
// Backend settings, exported as a single plain object.
const config = {
    // TCP port the backend server listens on.
    PORT: 3000,
};

module.exports = config;

6
backend/server.js Normal file
View File

@@ -0,0 +1,6 @@
const express = require('express');
const config = require('./config');
// Express application instance for the backend.
const app = express();

// Logs the port once the server has started accepting connections.
function reportListening() {
    console.log('Server running on port ' + config.PORT);
}

// Start listening on the port taken from ./config.
app.listen(config.PORT, reportListening);

17
crawler/config.js Normal file
View File

@@ -0,0 +1,17 @@
// Crawler settings plus shared runtime state, exported as one object.
const config = {
    // URLs of the domain lists the crawler downloads.
    domainList: [
        'https://www.iis.se/data/bardate_domains_nu.txt',
        'https://www.iis.se/data/bardate_domains.txt',
    ],
    // Path of the word list, located next to this file.
    wordList: __dirname + '/words.txt',
    // Matches strings made of ASCII letters only.
    lettersOnlyRegex: /^[A-Za-z]+$/,
    // MongoDB connection string.
    databaseURL: 'mongodb://localhost:27017/kivi',
    //====
    // Runtime state: starts empty here and is assigned by code outside this file.
    words: [],
};

module.exports = config;

31
crawler/crawler.js Normal file
View File

@@ -0,0 +1,31 @@
var config = require('./config');
const links = require('./helper/links');
var MongoClient = require ('mongodb').MongoClient;
var ObjectID = require ('mongodb').ObjectID;
var fs = require('fs');
// Entry point: connect to MongoDB, load the word list into memory, then
// download and filter every configured domain list.
//
// The word list is loaded BEFORE the downloads are started. The filtering done
// by links.getDomainList reads config.words, so starting the downloads while
// the file is still being read could filter against an empty list.
MongoClient.connect(config.databaseURL).then(database => {
    // Kept for the pending database work (see the commented-out index below).
    const db = database;
    //db.collection ('yesterday').createIndex ({domainName: 'text'});

    // Get word list into memory. A read failure is logged and treated as an
    // empty word list (best effort), so this promise never rejects.
    return new Promise(resolve => {
        fs.readFile(config.wordList, 'utf8', (err, data) => {
            if (err) {
                console.log(err);
                config.words = [];
            } else {
                // Accept both LF and CRLF files; a stray '\r' at the end of a
                // word would prevent it from ever matching a domain name.
                config.words = data.split(/\r?\n/);
            }
            resolve();
        });
    });
}).then(() => {
    config.domainList.forEach(url => {
        //get domain list from url
        links.getDomainList(url, (res) => {
            //res.map(obj =>{fs.appendFile('izlaz.txt',obj.domainName+'\n',err=>{console.log('er:' + err)})});
        });
    });
}).catch(reason => {
    console.log("Error : " + reason);
});

47
crawler/helper/links.js Normal file
View File

@@ -0,0 +1,47 @@
const config = require('../config');
const https = require("https");
module.exports = {
getDomainList : function(url, callback){
getRawDomainList(url,(raw)=>{
let result = [];
raw.split('\n').map(domain=>{
let dot = domain.indexOf('.');
if (dot !== -1){
let domainName = domain.substring(0,dot);
if (domainName.match(config.lettersOnlyRegex)){
//domain name contains only letters
//line in domain list is formatted as follows : [domain name]\t[expiration date]
result.push({domainName: domainName, expirationDate: domain.split('\t')[1]});
}
}
});
applyFilter(result, callback);
});
}
};
/**
 * Keeps only the domains whose name is exactly one of the entries in
 * config.words and passes them, in their original order, to `callback`.
 *
 * @param {Array<{domainName: string}>} domains - Candidate domain entries.
 * @param {function(Array)} callback - Called synchronously with the matches.
 */
var applyFilter = function (domains, callback) {
    // Build the lookup once per call so each membership test is O(1) instead
    // of a linear scan of the whole word list for every domain.
    const knownWords = new Set(config.words);
    callback(domains.filter((domain) => knownWords.has(domain.domainName)));
};
/**
 * Downloads `url` over HTTPS and passes the response body, decoded as UTF-8,
 * to `callback`.
 *
 * A request or response error is logged and reported as an empty body, so the
 * caller always gets exactly one callback instead of an uncaught 'error'
 * event terminating the process.
 *
 * @param {string} url - Address to download.
 * @param {function(string)} callback - Receives the body ('' on failure).
 */
var getRawDomainList = function (url, callback) {
    // Guard so the callback fires once even if both 'end' and 'error' occur.
    let finished = false;
    const finish = (body) => {
        if (finished) {
            return;
        }
        finished = true;
        callback(body);
    };
    const fail = (err) => {
        console.log('Error fetching ' + url + ' : ' + err);
        finish('');
    };

    https.get(url, res => {
        res.setEncoding("utf8");
        let body = "";
        res.on("data", data => {
            body += data;
        });
        res.on("end", () => {
            finish(body);
        });
        res.on("error", fail);
    }).on("error", fail);
};

2041
crawler/izlaz.txt Normal file

File diff suppressed because it is too large Load Diff

5
crawler/package.json Normal file
View File

@@ -0,0 +1,5 @@
{
"dependencies": {
"mongodb": "^2.2.33"
}
}

221599
crawler/words.txt Normal file

File diff suppressed because it is too large Load Diff