Copying Databases in Node

Yes, there are tons of great, StackOverflow-accepted ways to copy giant databases between servers. But sometimes a project has a special need that requires selectively copying subsets of data instead of an entire 15GB database.

So, when you have to programmatically copy rows between databases in Node...


Problems with LIMIT

When I first attempted this, I ran into several problems:

  • I didn't use a LIMIT, and ran out of memory.
  • I specified a LIMIT, but the bulk INSERT failed for some reason.
  • I specified a lower LIMIT, but the INSERT performed differently on each table.
  • I specified an arbitrarily low LIMIT, but on simple tables, it took longer.

I realized there's no perfect way of doing this, and I surely didn't want to insert a single row at a time.
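
To picture the first bullet: a naive single-shot copy looks roughly like the sketch below (using the remote/local knex instances set up in step 1, not the exact code I ran):

// Everything is pulled into process memory before a single giant
// bulk INSERT — fine for small tables, fatal for a 15GB one.
remote("something_huge")
  .select("*")
  .then(function(rows) {
    return local("something_huge").insert(rows);
  });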

Group INSERTs by Size, not Rows

First, let's use the following tools:

  • knex to query both the local and remote databases
  • Bluebird for promise-based control flow
  • progress for command-line progress bars
  • chalk to color the log output

The premise is to copy the table in groups of N rows, where N is chosen so that each group works out to roughly M bytes of memory.
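
With some made-up numbers (a hypothetical 1.2GB table with 4 million rows and the 25MB budget used below), the math works out like this:

var memoryLimit = 25 * 1024 * 1024;           // 25MB budget per group
var totalLength = 1.2 * 1024 * 1024 * 1024;   // data_length + index_length ≈ 1.2GB
var totalRows   = 4000000;
// Average row footprint ≈ totalLength / totalRows ≈ 322 bytes,
// so each group holds roughly memoryLimit / 322 ≈ 81,000 rows.
var rowLimit = Math.ceil(totalRows * memoryLimit / totalLength); // 81,381
// Copying all 4M rows therefore takes about 50 iterations.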

1. Setup dependencies

var knex = require('knex');
var Promise = require('bluebird');
var ProgressBar = require('progress');
var chalk = require('chalk');
var local = knex.initialize(...);
var remote = knex.initialize(...);

2. Get table stats

var getTableStats = function(table) {
  var stats = {};
  // Query the exact row count (`information_schema` only stores an estimate)
  return remote(table).count('*').then(function(response) {
    stats.totalRows = response.shift()['count(*)'];
  })
  // Request DB estimates for table size (approximate)
  .then(function() {
    return remote('information_schema.TABLES').select('data_length', 'index_length').where({
      table_name: table,
      table_schema: remote.client.connectionSettings.database
    });
  })
  // Calculate stats for # of rows & iterations to stay within memory limits
  .then(function(result) {
    // Base variables for insertion chunk calculations
    stats.memoryLimit = 25 * 1024 * 1024; // 25MB
    stats.dataLength = result[0]['data_length'];
    stats.indexLength = result[0]['index_length'];
    stats.totalLength = stats.dataLength + stats.indexLength;
    stats.rowLimit = Math.min(stats.totalRows, Math.ceil(stats.totalRows * stats.memoryLimit / stats.totalLength));
    return stats;
  });
};

3. Copy table in groups

Now that we have a set of stats built around a 25MB memory limit, we can copy the table:

var copyTable = function(table) {
  return getTableStats(table).then(function(stats) {
    // Empty array to make it easy to iterate for insertion group
    var iterations = new Array(Math.ceil(stats.totalRows / stats.rowLimit));
    // Promise.reduce ensures iterations are done sequentially, whereas
    // Promise.all/map runs in parallel
    return Promise.reduce(
      iterations,
      function(previous, iteration, i) {
        var rowOffset = stats.rowLimit * i;
        return copyRows(table, rowOffset, stats.rowLimit, stats.totalRows);
      },
      []
    );
  });
};
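
As a standalone illustration of that sequencing comment (not part of the copy script itself):

// Promise.reduce starts each step only after the previous one settles,
// so only one chunk of rows ever needs to be in memory at a time.
Promise.reduce([1, 2, 3], function(previous, n) {
  console.log("starting chunk", n);
  return Promise.delay(100).then(function() {
    console.log("finished chunk", n);
  });
}, null);
// Logs: starting 1, finished 1, starting 2, finished 2, starting 3, finished 3.
// Promise.map or Promise.all would kick off all three chunks at once.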

4. Copy individual rows

All that our copyTable function is missing is copyRows:

var copyRows = function(table, offset, limit, total) {
  return remote(table)
    .select("*")
    .offset(offset)
    .limit(limit)
    .then(function(rows) {
      // "[DATABASE] Inserting records N-M of O into TABLE"
      var msg = [
        "[" + chalk.yellow(local.client.connectionSettings.database) + "]",
        "Inserting records",
        chalk.green(offset + 1) + "-" + chalk.green(offset + rows.length),
        "of",
        chalk.green(total),
        "into",
        chalk.yellow(table),
        "[:bar] :percent :etas"
      ].join(" ");
      var bar = new ProgressBar(msg, {
        total: rows.length,
        width: 20
      });
      // Insert each record individually because it's actually faster (!?)
      // and gives us a pretty progress bar.
      return Promise.map(rows, function(row) {
        return local(table)
          .insert(row)
          .then(function() {
            bar.tick();
          });
      });
    });
};
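
One hedged variation: if the parallel single-row inserts ever saturate the local connection pool, later Bluebird releases let Promise.map take a concurrency option. This is a drop-in sketch for the Promise.map call above, not something the original script relies on (insertRowsThrottled is a made-up name):

// Same inner loop as copyRows, but with at most 10 inserts in flight at once.
var insertRowsThrottled = function(table, rows, bar) {
  return Promise.map(rows, function(row) {
    return local(table).insert(row).then(function() {
      bar.tick();
    });
  }, { concurrency: 10 });
};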

5. Finish

Finally, kick off the whole promise chain:

copyTable("something_huge").then(
function() {
console.log("Success!");
process.exit(0);
},
function(err) {
console.error(err);
process.exit(1);
}
);
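
And if there's more than one table to move, the same Promise.reduce trick keeps the copies sequential (the table names here are placeholders):

// Copy several tables one after another, then exit.
Promise.reduce(["users", "orders", "order_items"], function(previous, table) {
  return copyTable(table);
}, null).then(
  function() {
    console.log("Success!");
    process.exit(0);
  },
  function(err) {
    console.error(err);
    process.exit(1);
  }
);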
