From f13b7d861afdeca4836592b7b418527d8a0091cd Mon Sep 17 00:00:00 2001 From: ben-burlingham Date: Sun, 17 Jan 2016 17:55:48 -0800 Subject: [PATCH] Data scrape complete. --- .gitignore | 2 + scrape.js | 47 ----------- server.js | 48 +++++++++++ server/assemble.js | 166 +++++++++++++++++++++++++++++++++++++++ server/downloader.js | 14 +--- server/io.js | 44 +++++++++++ server/meteo.js | 165 ++++++++++++++++++++++++++++++++++++++ server/meteorological.js | 68 ---------------- server/noaa.js | 88 +++++++++++++++++++++ server/stations.js | 118 ++++++++++------------------ 10 files changed, 556 insertions(+), 204 deletions(-) delete mode 100644 scrape.js create mode 100644 server.js create mode 100644 server/assemble.js create mode 100644 server/io.js create mode 100644 server/meteo.js delete mode 100644 server/meteorological.js create mode 100644 server/noaa.js diff --git a/.gitignore b/.gitignore index 35b9803..a86bb1f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ experiments d3.min.js node_modules +data/meteo/txt +data/stations/xml diff --git a/scrape.js b/scrape.js deleted file mode 100644 index 910a9a9..0000000 --- a/scrape.js +++ /dev/null @@ -1,47 +0,0 @@ -var stations = require('./server/stations.js'); -var meteo = require('./server/meteorological.js'); -var Promise = require('es6-promise').Promise; -var chalk = require('chalk'); - -//===== Weather data scraper -// var date = new Date(); - -var len = stations.stations.length; - -//===== Meteorological data scraping - careful! -(function scrapeMeteo(currentStation) { - if (currentStation !== 0) { - console.log(chalk.cyan('=== Finished.\n')); - } - - if (currentStation >= len) { - // if (currentStation > 1) { - return; - } - - var arr = []; - - console.log(chalk.cyan('\n=== (' + currentStation + ') Starting ' + stations.stations[currentStation])); - - // Monthly files - // for (var m = 0; m < 12; m++) { - // arr.push(meteo.getInconsistent(stations.stations[currentStation], m, 2015)); - // arr.push(meteo.getMonth(stations.stations[currentStation], m)); - // } - - // TODO refactor to above format - // TODO refactor station requests into promises format - // for (var year = 1982; year < date.getFullYear() - 1; year++) { - // getYear(stationIDs[i], year); - // } - - // getCurrent(stationIDs[i]); - - // Wait for all to resolve, then recurse. - Promise.all(arr).then(scrapeMeteo.bind(null, currentStation + 1)); - -})(0); - -//===== Station data scraping - shouldn't need to change often. -// stations.downloadAllMetadata(); -// stations.parseAllMetadata(); diff --git a/server.js b/server.js new file mode 100644 index 0000000..5d83f4b --- /dev/null +++ b/server.js @@ -0,0 +1,48 @@ +'use strict'; + +(function() { + var stations = require('./server/stations'); + var meteo = require('./server/meteo'); + var assemble = require('./server/assemble'); + var IO = require('./server/io'); + var chalk = require('chalk'); + + function next(index) { + if (index >= stations.ids.length) { + console.log(chalk.green("Finished.")); + return; + } + + console.log('=== Starting next promise chain.'); + + var station = 46026 //stations.ids[index]; + + Promise.resolve() + //===== Download + // .then(stations.getMetadata.bind(null, station)) + // .then(meteo.getAllYears.bind(null, station, 1982, 2014)) + // .then(meteo.getAllMonths.bind(null, station, 2015)) + // .then(meteo.getAllNewest.bind(null, station, 2015)) + + //===== Parse + // .then(stations.parseStation.bind(null, station)) + // .then(meteo.parseAllMonths.bind(null, 46026, 2015)) + // .then(meteo.parseAllYears.bind(null, station, 1982, 2014)) + + //===== Assemble + .then(assemble.read.bind(null, station, 2015)) + .then(assemble.getAverages) + + // .then(function() { console.log('something') }) + + //===== Flow control + .then(function() { console.log('=== Chain complete.\n'); }) + // .then(next.bind(null, index + 1)) + .catch(IO.error); + + }; + + // TODO remove previous years from file, such as 46026-2015 shouldn't have data from 2014. + + next(0); +})(); diff --git a/server/assemble.js b/server/assemble.js new file mode 100644 index 0000000..bd4f479 --- /dev/null +++ b/server/assemble.js @@ -0,0 +1,166 @@ +'use strict'; + +var IO = require('./io'); +var meteo = require('./meteo'); + +// [ +// { +// id: str +// name: str +// lat: str +// lon: str + +// avg1982: { +// d: int[365] || null, +// w: int[52] || null, +// m: int[12] || null, +// y: int || null +// }, + +// avg1983: ... +// }, +// +// { +// id: str +// ... +// } +// ] + + +module.exports = { + /** + * + */ + read: function(station, year) { + return IO.read(meteo.dirs.json + station + '-' + year + '.json') + .then(module.exports.parse); + }, + + /** + * + */ + parse: function(str) { + var json = {}; + + try { + json = JSON.parse(str); + } catch(e) { + IO.error(e); + } + + return json; + }, + + /** + * + */ + getStation: function() { + + }, + + /** + * + */ + getYearlyAverage: function(arr, col) { + var sum = 0; + var count = 0; + + console.log('Yearly average for column ' + col + '.'); + + arr.forEach(function(row) { + sum += parseInt(row[col]); + count++; + }); + + var avg = Math.round((sum / count) * 10) / 10 || 0; + return avg; + }, + + /** + * + */ + getMonthlyAverages: function(arr, col) { + var sum, count; + var months = []; + var averages = []; + + console.log('Monthly averages for column ' + col + '.'); + + for (var i = 0; i < 12; i++) { + months[i] = []; + } + + // Assemble all the values for each month. + arr.forEach(function(row) { + months[row[1] - 1].push(row[col]); + }); + + // Get the average for each collection of values in each day of the year. + months.forEach(function(values, index) { + sum = 0; + count = 0; + + values.map(function(val) { + sum += parseInt(val); + count++; + }); + + averages[index] = Math.round((sum / count) * 10) / 10 || 0; + }); + + return averages; + }, + + /** + * + */ + getDailyAverages: function(arr, col) { + var sum, count, a, b, doy; + var days = []; + var averages = []; + var dayms = 1000 * 60 * 60 * 24; + + console.log('Daily averages for column ' + col + '.'); + + for (var i = 0; i <= 365; i++) { + days[i] = []; + } + + // Assemble all the values for each day of the year. + arr.forEach(function(row) { + a = new Date(row[0], row[1] - 1, row[2]); + b = new Date(row[0], 0, 1); + doy = Math.ceil((a - b) / dayms); + + days[doy].push(row[col]); + }); + + // Get the average for each collection of values in each day of the year. + days.forEach(function(values, index) { + sum = 0; + count = 0; + + values.map(function(val) { + sum += parseInt(val); + count++; + }); + + averages[index] = Math.round((sum / count) * 10) / 10 || 0; + }); + + return averages; + }, + + /** + * + */ + getAverages: function(arr) { + var columnToAverage = 14; + + // module.exports.getDailyAverages(arr, columnToAverage); + // module.exports.getMonthlyAverages(arr, columnToAverage); + module.exports.getYearlyAverage(arr, columnToAverage); + + return null; + } +}; diff --git a/server/downloader.js b/server/downloader.js index 0afa0e4..e9ab245 100644 --- a/server/downloader.js +++ b/server/downloader.js @@ -2,14 +2,8 @@ var http = require('http'); var fs = require('fs'); -var Promise = require('es6-promise').Promise; module.exports = { - - downloadedCount: 0, - - downloadedSize: 0, - /** * */ @@ -34,12 +28,8 @@ module.exports = { var pipe = response.pipe(file); pipe.on('finish', function() { - module.exports.downloadedSize += pipe.bytesWritten; - module.exports.downloadedCount++; - - console.log(url + ' --> ' + filename + ' --> ' + pipe.bytesWritten + ' bytes | ' + module.exports.downloadedSize + ' bytes total | ' + module.exports.downloadedCount + ' file(s)'); - - resolve(pipe.bytesWritten); + console.log('Download ' + filename + ' (' + pipe.bytesWritten + ' bytes)'); + resolve(); }); }); }); diff --git a/server/io.js b/server/io.js new file mode 100644 index 0000000..f644015 --- /dev/null +++ b/server/io.js @@ -0,0 +1,44 @@ +var fs = require('fs'); +var chalk = require('chalk'); + +/** + * File IO, error reporting. + */ +module.exports = { + /** + * + */ + read: function(file, aaa, bbb) { + return new Promise(function(resolve) { + fs.readFile(file, 'utf8', function(err, str) { + console.log('Read ' + file); + module.exports.error(err); + resolve(str || ''); + }); + }); + }, + + /** + * + */ + write: function(file, str) { + return new Promise(function(resolve) { + if (str) { + fs.writeFile(file, str, module.exports.error, resolve); + console.log('Write ' + file); + } + else { + resolve(); + } + }); + }, + + /** + * + */ + error: function(e) { + if (e !== null) { + console.log(chalk.yellow(e)); + } + } +}; diff --git a/server/meteo.js b/server/meteo.js new file mode 100644 index 0000000..6e90c54 --- /dev/null +++ b/server/meteo.js @@ -0,0 +1,165 @@ +'use strict' + +var downloader = require('./downloader.js'); +var IO = require('./io.js'); +var NOAA = require('./noaa.js'); + +module.exports = { + //========================= + // Read-only vars + //========================= + dirs: { + txt: 'data/meteo/txt/', + json: 'data/meteo/json/' + }, + + months: [null, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], + + //========================== + // Downloads + //========================== + /** + * + */ + getYear: function(station, year) { + var path = module.exports.dirs.txt + station + '/'; + var filename = station + 'h' + year + '.txt'; + + var url = 'http://www.ndbc.noaa.gov/view_text_file.php?'+ + 'filename=' + filename + '.gz&dir=data/historical/stdmet/'; + + downloader.mkdir(path); + return downloader.download(url, path + filename); + }, + + /** + * + */ + getMonth: function(station, month, year) { + var path = module.exports.dirs.txt + station + '/'; + var filename = station + month.toString(16) + year + '.txt'; + + var url = 'http://www.ndbc.noaa.gov/view_text_file.php?' + + 'filename=' + filename + '.gz&dir=data/stdmet/' + module.exports.months[month] + '/'; + + downloader.mkdir(path); + return downloader.download(url, path + filename); + }, + + /** + * + */ + getNewest: function(station, month, year) { + var path = module.exports.dirs.txt + station + '/'; + var filename = station + month.toString(16) + year + '-newest.txt'; + + var url = 'http://www.ndbc.noaa.gov/data/stdmet/' + module.exports.months[month] + '/' + station + '.txt'; + + downloader.mkdir(path); + return downloader.download(url, path + filename); + }, + + /** + * + */ + getCurrent: function(station) { + var filename = station + '.txt'; + + var url = 'http://www.ndbc.noaa.gov/data/realtime2/' + filename; + + return downloader.download(url, dir + filename); + }, + + //========================== + // Transformations + //========================== + + /** + * + */ + parseAllMonths: function(station, year) { + var arr = []; + var txtPath = module.exports.dirs.txt + station + '/'; + var jsonPath = module.exports.dirs.json + '/'; + + for (var month = 1; month <= 12; month++) { + arr.push(IO.read(txtPath + station + month.toString(16) + year + '-newest.txt').then(NOAA.parseTxt)); + arr.push(IO.read(txtPath + station + month.toString(16) + year + '.txt').then(NOAA.parseTxt)); + } + + return Promise.all(arr) + .then(NOAA.aggregate) + .then(NOAA.convert) + .then(function(str) { + IO.write(jsonPath + station + '-' + year + '.json', str); + }); + }, + + /** + * + */ + parseYear: function(station, year) { + var txtPath = module.exports.dirs.txt + station + '/'; + var jsonPath = module.exports.dirs.json + '/'; + + return IO.read(txtPath + station + 'h' + year + '.txt') + .then(NOAA.parseTxt) + .then(NOAA.convert) + .then(function(str) { + IO.write(jsonPath + station + '-' + year + '.json', str); + }); + }, + + //======================================== + // Promise Collections used in loops + //======================================== + + /** + * + */ + getAllYears: function(station, startYear, endYear) { + var arr = []; + for (var year = startYear; year <= endYear; year++) { + arr.push(module.exports.getYear(station, year)); + } + + return Promise.all(arr); + }, + + /** + * + */ + getAllMonths: function(station, year) { + var arr = []; + for (var month = 1; month <= 12; month++) { + arr.push(module.exports.getMonth(station, month, year)); + } + + return Promise.all(arr); + }, + + /** + * + */ + getAllNewest: function(station, year) { + var arr = []; + for (var month = 1; month <= 12; month++) { + arr.push(module.exports.getNewest(station, month, year)); + } + + return Promise.all(arr); + }, + + /** + * + */ + parseAllYears: function(station, startYear, endYear) { + var arr = []; + + for (var year = startYear; year <= endYear; year++) { + arr.push(module.exports.parseYear(station, year)); + } + + return Promise.all(arr); + }, +}; diff --git a/server/meteorological.js b/server/meteorological.js deleted file mode 100644 index 7435e09..0000000 --- a/server/meteorological.js +++ /dev/null @@ -1,68 +0,0 @@ -'use strict' - -var downloader = require('./downloader.js'); -var dir = 'data/meteorological/' -var Promise = require('es6-promise').Promise; - -module.exports = { - months: ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], - - /** - * - */ - getYear: function(buoy, yyyy) { - var filename = buoy + 'h' + yyyy + '.txt'; - - var url = 'http://www.ndbc.noaa.gov/view_text_file.php?'+ - 'filename=' + filename + '.gz&dir=data/historical/stdmet/'; - - return downloader.download(url, dir + filename); - }, - - /** - * - */ - getMonth: function(buoy, m, yyyy) { - var month = m + 1; - month = (month == 10 ? 'a' : month); - month = (month == 11 ? 'b' : month); - month = (month == 12 ? 'c' : month); - - var filename = buoy + month.toString() + yyyy + '.txt'; - - var url = 'http://www.ndbc.noaa.gov/view_text_file.php?' + - 'filename=' + filename + '.gz&dir=data/stdmet/' + this.months[m] + '/'; - - var path = dir + buoy + '/'; - downloader.mkdir(path); - return downloader.download(url, path + filename); - }, - - /** - * - */ - getCurrent: function(buoy) { - var filename = buoy + '.txt'; - - var url = 'http://www.ndbc.noaa.gov/data/realtime2/' + filename; - - return downloader.download(url, dir + filename); - }, - - /** - * - */ - getInconsistent: function(buoy, m) { - var url = 'http://www.ndbc.noaa.gov/data/stdmet/' + this.months[m] + '/' + buoy + '.txt'; - - var month = m + 1; - month = (month == 10 ? 'a' : month); - month = (month == 11 ? 'b' : month); - month = (month == 12 ? 'c' : month); - - var path = dir + buoy + '/'; - downloader.mkdir(path); - - return downloader.download(url, path + buoy.toString() + month + '2015-newest.txt'); - } -}; diff --git a/server/noaa.js b/server/noaa.js new file mode 100644 index 0000000..f713a3a --- /dev/null +++ b/server/noaa.js @@ -0,0 +1,88 @@ +/** + * NOAA-specific filtering. + */ +module.exports = { + /** + * + */ + splitLine: function(str) { + var arr = str.split(/\s+/); + arr.filter(function(val) { return (val.length > 0); }) + return arr; + }, + + /** + * Receives a stream from a file read event. + */ + parseTxt: function(str) { + console.log('Parsing NOAA space-delimited columnar data into JSON.'); + + var arr = []; + var cols = null; + var lines = str.split('\n'); + var len = lines.length; + + if (len > 8) { + for (var i = 0; i < len; i++) { + cols = module.exports.splitLine(lines[i]); + cols.length > 0 ? arr.push(cols) : null; + } + } + + return arr; + }, + + /** + * After all files have been parsed, Promises.all passes them all as an array. + * This function does filtering on them and finalizes a JSON string. + */ + convert: function(arr) { + console.log('Converting aggregated month files to JSON.'); + + // Sort. + var sorted = arr.sort(function(a, b) { + var dateA = parseInt([a[0], a[1], a[2], a[3], ('00' + a[4]).substr(-2)].join('')) || 0; + var dateB = parseInt([b[0], b[1], b[2], b[3], ('00' + b[4]).substr(-2)].join('')) || 0; + + return dateA - dateB; + }); + + // Filter for multiple headings/units rows. + var result = sorted.filter(function(row) { + return !(row[0] === '#YY' || row[0] === '#yr' || row.length === 1); + }); + + // Convert to JSON that can later be read easily. + var str = null; + if (result.length > 0) { + str = JSON.stringify(result) + str = str.replace(/\],\[/g, '],\n['); + } + + return str; + }, + + /** + * Used to aggregate month files after they have been split into a lines array. + * Each line has been split into individual elements. + * The array passed to this function is therefore an array of two dimensional arrays. + * + * This function adds non-empty lines to a common result set. + */ + aggregate: function(arr) { + console.log('Aggregating month files for the year.'); + + var tmp = []; + + arr.forEach(function(rows) { + if (rows.length === 0) { + return; + } + + tmp = tmp.concat(rows); + }); + + return tmp; + } +}; + diff --git a/server/stations.js b/server/stations.js index 61018f3..31fc88a 100644 --- a/server/stations.js +++ b/server/stations.js @@ -1,24 +1,29 @@ 'use strict' -var fs = require('fs'); var downloader = require('./downloader'); +var IO = require('./io'); var xml2js = require ('xml2js'); -var dir = 'data/stations/'; +// var dir = 'data/stations/'; module.exports = { + dirs: { + xml: 'data/stations/xml/', + json: 'data/stations/json/' + }, + /** * Add station IDs here, A-Z 0-9 */ - stations: [ - 'ANVC1', - 'BDXC1', - 'CECC1', - 'CPXC1', - 'HBYC1', - 'ICAC1', - 'NTBC1', - 'PRYC1', - 'PTGC1', + ids: [ + // 'ANVC1', + // 'BDXC1', + // 'CECC1', + // 'CPXC1', + // 'HBYC1', + // 'ICAC1', + // 'NTBC1', + // 'PRYC1', + // 'PTGC1', '46011', '46012', '46013', @@ -55,78 +60,37 @@ module.exports = { /** * Downloads each station's data XML. */ - downloadAllMetadata: function() { - var len = this.stations.length; - var url; - - for (var i = 0; i < len; i++) { - url = 'http://www.ndbc.noaa.gov/get_observation_as_xml.php?station=' + this.stations[i]; - downloader.download(url, dir + this.stations[i] + '.txt'); - } + getMetadata: function(station) { + var url = 'http://www.ndbc.noaa.gov/get_observation_as_xml.php?station=' + station; + return downloader.download(url, module.exports.dirs.xml + station + '.xml'); }, /** * */ - parseAllMetadata: function() { - function done() { - if (data.length === len) { - fs.writeFile(outfile, data.join('\n'), function(err) { - if (err) { - throw new Error(err) - } - - console.log('Station data written to ' + outfile); - }); - } - }; - - function next() { - // Wait for other concurrent files to finish. - if (count !== data.length) { - return; - } - - var concurrent = 3; - var tmp; - - for (var i = 0; i < concurrent; i++) { - tmp = count + i; - - if (tmp === len) { - break; + parse: function(xml) { + return new Promise(function(resolve) { + xml2js.parseString(xml, function(err, json) { + var str = null; + + // Do not stringify if null. + if (json) { + str = JSON.stringify(json.observation.$) } - // console.log(tmp + "(" + count + ")" + " reading " + dir + module.exports.stations[tmp] + '.txt'); - fs.readFile(dir + module.exports.stations[tmp] + '.txt', 'utf8', thenParse); - } - - count += concurrent; - }; - - function thenParse(err, xml) { - if (err) { - throw new Error(err) - } - - xml2js.parseString(xml, thenReport); - }; - - function thenReport(err, json) { - if (err) { - throw new Error(err) - } - - data.push(JSON.stringify(json.observation.$)); - next(); - done(); - }; - - var outfile = 'data-stations.json'; - var len = module.exports.stations.length; - var count = 0; - var data = []; + resolve(str); + }); + }); + }, - next(); + /** + * + */ + parseStation: function(station) { + return IO.read(module.exports.dirs.xml + station + '.xml') + .then(module.exports.parse) + .then(function(str) { + IO.write(module.exports.dirs.json + station + '.json', str) + }) }, };