Data scrape complete.

master
ben-burlingham 10 years ago
parent 86f4cafca5
commit f13b7d861a
  1. 2
      .gitignore
  2. 47
      scrape.js
  3. 48
      server.js
  4. 166
      server/assemble.js
  5. 14
      server/downloader.js
  6. 44
      server/io.js
  7. 165
      server/meteo.js
  8. 68
      server/meteorological.js
  9. 88
      server/noaa.js
  10. 118
      server/stations.js

2
.gitignore vendored

@ -1,3 +1,5 @@
experiments
d3.min.js
node_modules
data/meteo/txt
data/stations/xml

@ -1,47 +0,0 @@
var stations = require('./server/stations.js');
var meteo = require('./server/meteorological.js');
var Promise = require('es6-promise').Promise;
var chalk = require('chalk');
//===== Weather data scraper
// var date = new Date();
var len = stations.stations.length;
//===== Meteorological data scraping - careful!
/**
 * Walks the configured stations one at a time, starting at index 0.
 *
 * For each station it collects that station's download promises, waits for
 * all of them to settle successfully, and then calls itself with the next
 * index. All download calls are currently commented out, so the collected
 * list is empty and the recursion advances immediately.
 *
 * @param {number} currentStation Index into `stations.stations`.
 */
(function scrapeMeteo(currentStation) {
    // Any call after the first one means the previous station's batch resolved.
    var isFirstStation = (currentStation === 0);
    if (!isFirstStation) {
        console.log(chalk.cyan('=== Finished.\n'));
    }

    // Stop recursing once the index runs past the station list.
    // if (currentStation > 1) {
    if (currentStation >= len) {
        return;
    }

    var stationId = stations.stations[currentStation];
    var pending = [];

    console.log(chalk.cyan('\n=== (' + currentStation + ') Starting ' + stationId));

    // Monthly files
    // for (var m = 0; m < 12; m++) {
    //     pending.push(meteo.getInconsistent(stations.stations[currentStation], m, 2015));
    //     pending.push(meteo.getMonth(stations.stations[currentStation], m));
    // }

    // TODO refactor to above format
    // TODO refactor station requests into promises format
    // for (var year = 1982; year < date.getFullYear() - 1; year++) {
    //     getYear(stationIDs[i], year);
    // }
    // getCurrent(stationIDs[i]);

    // Wait for all to resolve, then recurse into the next station.
    var advance = function() {
        scrapeMeteo(currentStation + 1);
    };
    Promise.all(pending).then(advance);
})(0);
//===== Station data scraping - shouldn't need to change often.
// stations.downloadAllMetadata();
// stations.parseAllMetadata();

@ -0,0 +1,48 @@
'use strict';
(function() {
    var stations = require('./server/stations');
    var meteo = require('./server/meteo');
    var assemble = require('./server/assemble');
    var IO = require('./server/io');
    var chalk = require('chalk');

    /**
     * Runs one promise chain (download -> parse -> assemble) for a station.
     *
     * Most steps are commented out and act as manual toggles. As written, the
     * station is hard-coded to 46026, only the assemble steps run, and the
     * chain does not advance to `index + 1`.
     *
     * @param {number} index Position in `stations.ids`; currently only used
     *     for the bounds check below.
     */
    function next(index) {
        var total = stations.ids.length;

        if (index >= total) {
            console.log(chalk.green("Finished."));
            return;
        }

        console.log('=== Starting next promise chain.');

        var station = 46026; // stations.ids[index];
        var readYear = assemble.read.bind(null, station, 2015);
        var reportDone = function() {
            console.log('=== Chain complete.\n');
        };

        Promise.resolve()
            //===== Download
            // .then(stations.getMetadata.bind(null, station))
            // .then(meteo.getAllYears.bind(null, station, 1982, 2014))
            // .then(meteo.getAllMonths.bind(null, station, 2015))
            // .then(meteo.getAllNewest.bind(null, station, 2015))

            //===== Parse
            // .then(stations.parseStation.bind(null, station))
            // .then(meteo.parseAllMonths.bind(null, 46026, 2015))
            // .then(meteo.parseAllYears.bind(null, station, 1982, 2014))

            //===== Assemble
            .then(readYear)
            .then(assemble.getAverages)
            // .then(function() { console.log('something') })

            //===== Flow control
            .then(reportDone)
            // .then(next.bind(null, index + 1))
            .catch(IO.error);
    }

    // TODO remove previous years from file, such as 46026-2015 shouldn't have data from 2014.

    next(0);
})();

@ -0,0 +1,166 @@
'use strict';
var IO = require('./io');
var meteo = require('./meteo');
// [
// {
// id: str
// name: str
// lat: str
// lon: str
// avg1982: {
// d: int[365] || null,
// w: int[52] || null,
// m: int[12] || null,
// y: int || null
// },
// avg1983: ...
// },
//
// {
// id: str
// ...
// }
// ]
module.exports = {
/**
*
*/
read: function(station, year) {
return IO.read(meteo.dirs.json + station + '-' + year + '.json')
.then(module.exports.parse);
},
/**
*
*/
parse: function(str) {
var json = {};
try {
json = JSON.parse(str);
} catch(e) {
IO.error(e);
}
return json;
},
/**
*
*/
getStation: function() {
},
/**
*
*/
getYearlyAverage: function(arr, col) {
var sum = 0;
var count = 0;
console.log('Yearly average for column ' + col + '.');
arr.forEach(function(row) {
sum += parseInt(row[col]);
count++;
});
var avg = Math.round((sum / count) * 10) / 10 || 0;
return avg;
},
/**
*
*/
getMonthlyAverages: function(arr, col) {
var sum, count;
var months = [];
var averages = [];
console.log('Monthly averages for column ' + col + '.');
for (var i = 0; i < 12; i++) {
months[i] = [];
}
// Assemble all the values for each month.
arr.forEach(function(row) {
months[row[1] - 1].push(row[col]);
});
// Get the average for each collection of values in each day of the year.
months.forEach(function(values, index) {
sum = 0;
count = 0;
values.map(function(val) {
sum += parseInt(val);
count++;
});
averages[index] = Math.round((sum / count) * 10) / 10 || 0;
});
return averages;
},
/**
*
*/
getDailyAverages: function(arr, col) {
var sum, count, a, b, doy;
var days = [];
var averages = [];
var dayms = 1000 * 60 * 60 * 24;
console.log('Daily averages for column ' + col + '.');
for (var i = 0; i <= 365; i++) {
days[i] = [];
}
// Assemble all the values for each day of the year.
arr.forEach(function(row) {
a = new Date(row[0], row[1] - 1, row[2]);
b = new Date(row[0], 0, 1);
doy = Math.ceil((a - b) / dayms);
days[doy].push(row[col]);
});
// Get the average for each collection of values in each day of the year.
days.forEach(function(values, index) {
sum = 0;
count = 0;
values.map(function(val) {
sum += parseInt(val);
count++;
});
averages[index] = Math.round((sum / count) * 10) / 10 || 0;
});
return averages;
},
/**
*
*/
getAverages: function(arr) {
var columnToAverage = 14;
// module.exports.getDailyAverages(arr, columnToAverage);
// module.exports.getMonthlyAverages(arr, columnToAverage);
module.exports.getYearlyAverage(arr, columnToAverage);
return null;
}
};

@ -2,14 +2,8 @@
var http = require('http');
var fs = require('fs');
var Promise = require('es6-promise').Promise;
module.exports = {
downloadedCount: 0,
downloadedSize: 0,
/**
*
*/
@ -34,12 +28,8 @@ module.exports = {
var pipe = response.pipe(file);
pipe.on('finish', function() {
module.exports.downloadedSize += pipe.bytesWritten;
module.exports.downloadedCount++;
console.log(url + ' --> ' + filename + ' --> ' + pipe.bytesWritten + ' bytes | ' + module.exports.downloadedSize + ' bytes total | ' + module.exports.downloadedCount + ' file(s)');
resolve(pipe.bytesWritten);
console.log('Download ' + filename + ' (' + pipe.bytesWritten + ' bytes)');
resolve();
});
});
});

@ -0,0 +1,44 @@
var fs = require('fs');
var chalk = require('chalk');
/**
* File IO, error reporting.
*/
module.exports = {
/**
*
*/
read: function(file, aaa, bbb) {
return new Promise(function(resolve) {
fs.readFile(file, 'utf8', function(err, str) {
console.log('Read ' + file);
module.exports.error(err);
resolve(str || '');
});
});
},
/**
*
*/
write: function(file, str) {
return new Promise(function(resolve) {
if (str) {
fs.writeFile(file, str, module.exports.error, resolve);
console.log('Write ' + file);
}
else {
resolve();
}
});
},
/**
*
*/
error: function(e) {
if (e !== null) {
console.log(chalk.yellow(e));
}
}
};

@ -0,0 +1,165 @@
'use strict'
var downloader = require('./downloader.js');
var IO = require('./io.js');
var NOAA = require('./noaa.js');
module.exports = {
//=========================
// Read-only vars
//=========================
dirs: {
txt: 'data/meteo/txt/',
json: 'data/meteo/json/'
},
months: [null, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
//==========================
// Downloads
//==========================
/**
*
*/
getYear: function(station, year) {
var path = module.exports.dirs.txt + station + '/';
var filename = station + 'h' + year + '.txt';
var url = 'http://www.ndbc.noaa.gov/view_text_file.php?'+
'filename=' + filename + '.gz&dir=data/historical/stdmet/';
downloader.mkdir(path);
return downloader.download(url, path + filename);
},
/**
*
*/
getMonth: function(station, month, year) {
var path = module.exports.dirs.txt + station + '/';
var filename = station + month.toString(16) + year + '.txt';
var url = 'http://www.ndbc.noaa.gov/view_text_file.php?' +
'filename=' + filename + '.gz&dir=data/stdmet/' + module.exports.months[month] + '/';
downloader.mkdir(path);
return downloader.download(url, path + filename);
},
/**
*
*/
getNewest: function(station, month, year) {
var path = module.exports.dirs.txt + station + '/';
var filename = station + month.toString(16) + year + '-newest.txt';
var url = 'http://www.ndbc.noaa.gov/data/stdmet/' + module.exports.months[month] + '/' + station + '.txt';
downloader.mkdir(path);
return downloader.download(url, path + filename);
},
/**
*
*/
getCurrent: function(station) {
var filename = station + '.txt';
var url = 'http://www.ndbc.noaa.gov/data/realtime2/' + filename;
return downloader.download(url, dir + filename);
},
//==========================
// Transformations
//==========================
/**
*
*/
parseAllMonths: function(station, year) {
var arr = [];
var txtPath = module.exports.dirs.txt + station + '/';
var jsonPath = module.exports.dirs.json + '/';
for (var month = 1; month <= 12; month++) {
arr.push(IO.read(txtPath + station + month.toString(16) + year + '-newest.txt').then(NOAA.parseTxt));
arr.push(IO.read(txtPath + station + month.toString(16) + year + '.txt').then(NOAA.parseTxt));
}
return Promise.all(arr)
.then(NOAA.aggregate)
.then(NOAA.convert)
.then(function(str) {
IO.write(jsonPath + station + '-' + year + '.json', str);
});
},
/**
*
*/
parseYear: function(station, year) {
var txtPath = module.exports.dirs.txt + station + '/';
var jsonPath = module.exports.dirs.json + '/';
return IO.read(txtPath + station + 'h' + year + '.txt')
.then(NOAA.parseTxt)
.then(NOAA.convert)
.then(function(str) {
IO.write(jsonPath + station + '-' + year + '.json', str);
});
},
//========================================
// Promise Collections used in loops
//========================================
/**
*
*/
getAllYears: function(station, startYear, endYear) {
var arr = [];
for (var year = startYear; year <= endYear; year++) {
arr.push(module.exports.getYear(station, year));
}
return Promise.all(arr);
},
/**
*
*/
getAllMonths: function(station, year) {
var arr = [];
for (var month = 1; month <= 12; month++) {
arr.push(module.exports.getMonth(station, month, year));
}
return Promise.all(arr);
},
/**
*
*/
getAllNewest: function(station, year) {
var arr = [];
for (var month = 1; month <= 12; month++) {
arr.push(module.exports.getNewest(station, month, year));
}
return Promise.all(arr);
},
/**
*
*/
parseAllYears: function(station, startYear, endYear) {
var arr = [];
for (var year = startYear; year <= endYear; year++) {
arr.push(module.exports.parseYear(station, year));
}
return Promise.all(arr);
},
};

@ -1,68 +0,0 @@
'use strict'
var downloader = require('./downloader.js');
var dir = 'data/meteorological/'
var Promise = require('es6-promise').Promise;
module.exports = {
months: ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
/**
*
*/
getYear: function(buoy, yyyy) {
var filename = buoy + 'h' + yyyy + '.txt';
var url = 'http://www.ndbc.noaa.gov/view_text_file.php?'+
'filename=' + filename + '.gz&dir=data/historical/stdmet/';
return downloader.download(url, dir + filename);
},
/**
*
*/
getMonth: function(buoy, m, yyyy) {
var month = m + 1;
month = (month == 10 ? 'a' : month);
month = (month == 11 ? 'b' : month);
month = (month == 12 ? 'c' : month);
var filename = buoy + month.toString() + yyyy + '.txt';
var url = 'http://www.ndbc.noaa.gov/view_text_file.php?' +
'filename=' + filename + '.gz&dir=data/stdmet/' + this.months[m] + '/';
var path = dir + buoy + '/';
downloader.mkdir(path);
return downloader.download(url, path + filename);
},
/**
*
*/
getCurrent: function(buoy) {
var filename = buoy + '.txt';
var url = 'http://www.ndbc.noaa.gov/data/realtime2/' + filename;
return downloader.download(url, dir + filename);
},
/**
*
*/
getInconsistent: function(buoy, m) {
var url = 'http://www.ndbc.noaa.gov/data/stdmet/' + this.months[m] + '/' + buoy + '.txt';
var month = m + 1;
month = (month == 10 ? 'a' : month);
month = (month == 11 ? 'b' : month);
month = (month == 12 ? 'c' : month);
var path = dir + buoy + '/';
downloader.mkdir(path);
return downloader.download(url, path + buoy.toString() + month + '2015-newest.txt');
}
};

@ -0,0 +1,88 @@
/**
* NOAA-specific filtering.
*/
module.exports = {
/**
*
*/
splitLine: function(str) {
var arr = str.split(/\s+/);
arr.filter(function(val) { return (val.length > 0); })
return arr;
},
/**
* Receives a stream from a file read event.
*/
parseTxt: function(str) {
console.log('Parsing NOAA space-delimited columnar data into JSON.');
var arr = [];
var cols = null;
var lines = str.split('\n');
var len = lines.length;
if (len > 8) {
for (var i = 0; i < len; i++) {
cols = module.exports.splitLine(lines[i]);
cols.length > 0 ? arr.push(cols) : null;
}
}
return arr;
},
/**
* After all files have been parsed, Promises.all passes them all as an array.
* This function does filtering on them and finalizes a JSON string.
*/
convert: function(arr) {
console.log('Converting aggregated month files to JSON.');
// Sort.
var sorted = arr.sort(function(a, b) {
var dateA = parseInt([a[0], a[1], a[2], a[3], ('00' + a[4]).substr(-2)].join('')) || 0;
var dateB = parseInt([b[0], b[1], b[2], b[3], ('00' + b[4]).substr(-2)].join('')) || 0;
return dateA - dateB;
});
// Filter for multiple headings/units rows.
var result = sorted.filter(function(row) {
return !(row[0] === '#YY' || row[0] === '#yr' || row.length === 1);
});
// Convert to JSON that can later be read easily.
var str = null;
if (result.length > 0) {
str = JSON.stringify(result)
str = str.replace(/\],\[/g, '],\n[');
}
return str;
},
/**
* Used to aggregate month files after they have been split into a lines array.
* Each line has been split into individual elements.
* The array passed to this function is therefore an array of two dimensional arrays.
*
* This function adds non-empty lines to a common result set.
*/
aggregate: function(arr) {
console.log('Aggregating month files for the year.');
var tmp = [];
arr.forEach(function(rows) {
if (rows.length === 0) {
return;
}
tmp = tmp.concat(rows);
});
return tmp;
}
};

@ -1,24 +1,29 @@
'use strict'
var fs = require('fs');
var downloader = require('./downloader');
var IO = require('./io');
var xml2js = require ('xml2js');
var dir = 'data/stations/';
// var dir = 'data/stations/';
module.exports = {
dirs: {
xml: 'data/stations/xml/',
json: 'data/stations/json/'
},
/**
* Add station IDs here, A-Z 0-9
*/
stations: [
'ANVC1',
'BDXC1',
'CECC1',
'CPXC1',
'HBYC1',
'ICAC1',
'NTBC1',
'PRYC1',
'PTGC1',
ids: [
// 'ANVC1',
// 'BDXC1',
// 'CECC1',
// 'CPXC1',
// 'HBYC1',
// 'ICAC1',
// 'NTBC1',
// 'PRYC1',
// 'PTGC1',
'46011',
'46012',
'46013',
@ -55,78 +60,37 @@ module.exports = {
/**
* Downloads each station's data XML.
*/
downloadAllMetadata: function() {
var len = this.stations.length;
var url;
for (var i = 0; i < len; i++) {
url = 'http://www.ndbc.noaa.gov/get_observation_as_xml.php?station=' + this.stations[i];
downloader.download(url, dir + this.stations[i] + '.txt');
}
getMetadata: function(station) {
var url = 'http://www.ndbc.noaa.gov/get_observation_as_xml.php?station=' + station;
return downloader.download(url, module.exports.dirs.xml + station + '.xml');
},
/**
*
*/
parseAllMetadata: function() {
function done() {
if (data.length === len) {
fs.writeFile(outfile, data.join('\n'), function(err) {
if (err) {
throw new Error(err)
}
console.log('Station data written to ' + outfile);
});
}
};
function next() {
// Wait for other concurrent files to finish.
if (count !== data.length) {
return;
}
var concurrent = 3;
var tmp;
for (var i = 0; i < concurrent; i++) {
tmp = count + i;
if (tmp === len) {
break;
parse: function(xml) {
return new Promise(function(resolve) {
xml2js.parseString(xml, function(err, json) {
var str = null;
// Do not stringify if null.
if (json) {
str = JSON.stringify(json.observation.$)
}
// console.log(tmp + "(" + count + ")" + " reading " + dir + module.exports.stations[tmp] + '.txt');
fs.readFile(dir + module.exports.stations[tmp] + '.txt', 'utf8', thenParse);
}
count += concurrent;
};
function thenParse(err, xml) {
if (err) {
throw new Error(err)
}
xml2js.parseString(xml, thenReport);
};
function thenReport(err, json) {
if (err) {
throw new Error(err)
}
data.push(JSON.stringify(json.observation.$));
next();
done();
};
var outfile = 'data-stations.json';
var len = module.exports.stations.length;
var count = 0;
var data = [];
resolve(str);
});
});
},
next();
/**
*
*/
parseStation: function(station) {
return IO.read(module.exports.dirs.xml + station + '.xml')
.then(module.exports.parse)
.then(function(str) {
IO.write(module.exports.dirs.json + station + '.json', str)
})
},
};

Loading…
Cancel
Save