/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.Import.CellImporter;
import org.apache.hadoop.hbase.mapreduce.Import.Importer;
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tool used to copy a table to another one, which can be on a different cluster setup. It is also
 * configurable with a start and end time, as well as a specification of the region server
 * implementation if different from the local cluster.
 */
@InterfaceAudience.Public
public class CopyTable extends Configured implements Tool {
  private static final Logger LOG = LoggerFactory.getLogger(CopyTable.class);

  final static String NAME = "copytable";
  long startTime = 0;
  long endTime = HConstants.LATEST_TIMESTAMP;
  int batch = Integer.MAX_VALUE;
  int cacheRow = -1;
  int versions = -1;
  String tableName = null;
  String startRow = null;
  String stopRow = null;
  String dstTableName = null;
  String peerAddress = null;
  String families = null;
  boolean allCells = false;
  static boolean shuffle = false;

  boolean bulkload = false;
  Path bulkloadDir = null;

  boolean readingSnapshot = false;
  String snapshot = null;

  private final static String JOB_NAME_CONF_KEY = "mapreduce.job.name";

  private Path generateUniqTempDir(boolean withDirCreated) throws IOException {
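    // Build a unique temporary directory under the current FileSystem working directory,
    // i.e. <working dir>/copytable/<random UUID>, creating the UUID directory only when requested.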
    FileSystem fs = CommonFSUtils.getCurrentFileSystem(getConf());
    Path dir = new Path(fs.getWorkingDirectory(), NAME);
    if (!fs.exists(dir)) {
      fs.mkdirs(dir);
    }
    Path newDir = new Path(dir, UUID.randomUUID().toString());
    if (withDirCreated) {
      fs.mkdirs(newDir);
    }
    return newDir;
  }

  private void initCopyTableMapperReducerJob(Job job, Scan scan) throws IOException {
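    // Bulkload jobs use CellImporter, which emits Cells for HFileOutputFormat2; direct-copy
    // jobs use Importer, which emits Mutations written to the destination table.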
    Class<? extends TableMapper> mapper = bulkload ? CellImporter.class : Importer.class;
    if (readingSnapshot) {
      TableMapReduceUtil.initTableSnapshotMapperJob(snapshot, scan, mapper, null, null, job, true,
        generateUniqTempDir(true));
    } else {
      TableMapReduceUtil.initTableMapperJob(tableName, scan, mapper, null, null, job);
    }
  }

  /**
   * Sets up the actual job.
   * @param args The command line parameters.
   * @return The newly created job.
   * @throws IOException When setting up the job fails.
   */
  public Job createSubmittableJob(String[] args) throws IOException {
    if (!doCommandLine(args)) {
      return null;
    }

    String jobName = NAME + "_" + (tableName == null ? snapshot : tableName);
    Job job = Job.getInstance(getConf(), getConf().get(JOB_NAME_CONF_KEY, jobName));
    job.setJarByClass(CopyTable.class);
    Scan scan = new Scan();

    scan.setBatch(batch);
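    // Disable block caching so this full-scan MapReduce job does not evict hot data from
    // the region server block cache.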
    scan.setCacheBlocks(false);

    if (cacheRow > 0) {
      scan.setCaching(cacheRow);
    } else {
      scan.setCaching(getConf().getInt(HConstants.HBASE_CLIENT_SCANNER_CACHING, 100));
    }

    scan.setTimeRange(startTime, endTime);

    if (allCells) {
      scan.setRaw(true);
    }
    if (shuffle) {
      job.getConfiguration().set(TableInputFormat.SHUFFLE_MAPS, "true");
    }
    if (versions >= 0) {
      scan.readVersions(versions);
    }

    if (startRow != null) {
      scan.withStartRow(Bytes.toBytesBinary(startRow));
    }

    if (stopRow != null) {
      scan.withStopRow(Bytes.toBytesBinary(stopRow));
    }

    if (families != null) {
      String[] fams = families.split(",");
      Map<String, String> cfRenameMap = new HashMap<>();
      for (String fam : fams) {
        String sourceCf;
        if (fam.contains(":")) {
          // fam looks like "sourceCfName:destCfName"
          String[] srcAndDest = fam.split(":", 2);
          sourceCf = srcAndDest[0];
          String destCf = srcAndDest[1];
          cfRenameMap.put(sourceCf, destCf);
        } else {
          // fam is just "sourceCf"
          sourceCf = fam;
        }
        scan.addFamily(Bytes.toBytes(sourceCf));
      }
      Import.configureCfRenaming(job.getConfiguration(), cfRenameMap);
    }
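    // Map-only job: mappers either write HFiles (bulkload) or push edits directly to the
    // destination table, so no reduce phase is configured.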
    job.setNumReduceTasks(0);

    if (bulkload) {
      initCopyTableMapperReducerJob(job, scan);

      // We need to split the inputs by destination tables so that the map output can be
      // bulk-loaded.
      TableInputFormat.configureSplitTable(job, TableName.valueOf(dstTableName));

      bulkloadDir = generateUniqTempDir(false);
      LOG.info("HFiles will be stored at " + this.bulkloadDir);
      HFileOutputFormat2.setOutputPath(job, bulkloadDir);
      try (Connection conn = ConnectionFactory.createConnection(getConf());
        Admin admin = conn.getAdmin()) {
        HFileOutputFormat2.configureIncrementalLoadMap(job,
          admin.getDescriptor(TableName.valueOf(dstTableName)));
      }
    } else {
      initCopyTableMapperReducerJob(job, scan);
      TableMapReduceUtil.initTableReducerJob(dstTableName, null, job, null, peerAddress);
    }

    return job;
  }
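
  /*
   * A minimal programmatic sketch of driving this tool without the shell. The table names
   * below are illustrative assumptions, not part of this class:
   *
   *   CopyTable copy = new CopyTable();
   *   copy.setConf(HBaseConfiguration.create());
   *   Job job = copy.createSubmittableJob(new String[] { "--new.name=destTable", "sourceTable" });
   *   boolean ok = job != null && job.waitForCompletion(true);
   *
   * Note that run() additionally bulk-loads the generated HFiles when --bulkload is used; the
   * sketch above only covers job submission.
   */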

  /*
   * @param errorMsg Error message. Can be null.
   */
  private static void printUsage(final String errorMsg) {
    if (errorMsg != null && errorMsg.length() > 0) {
      System.err.println("ERROR: " + errorMsg);
    }
    System.err.println("Usage: CopyTable [general options] [--starttime=X] [--endtime=Y] "
      + "[--new.name=NEW] [--peer.adr=ADR] <tablename | snapshotName>");
    System.err.println();
    System.err.println("Options:");
    System.err.println(" rs.class     hbase.regionserver.class of the peer cluster");
    System.err.println("              specify if different from current cluster");
    System.err.println(" rs.impl      hbase.regionserver.impl of the peer cluster");
    System.err.println(" startrow     the start row");
    System.err.println(" stoprow      the stop row");
    System.err.println(" starttime    beginning of the time range (unixtime in millis)");
    System.err.println("              without endtime means from starttime to forever");
    System.err.println(" endtime      end of the time range.  Ignored if no starttime specified.");
    System.err.println(" versions     number of cell versions to copy");
    System.err.println(" new.name     new table's name");
    System.err.println(" peer.adr     Address of the peer cluster given in the format");
    System.err.println("              hbase.zookeeper.quorum:hbase.zookeeper.client"
      + ".port:zookeeper.znode.parent");
    System.err.println(" families     comma-separated list of families to copy");
    System.err.println("              To copy from cf1 to cf2, give sourceCfName:destCfName. ");
    System.err.println("              To keep the same name, just give \"cfName\"");
    System.err.println(" all.cells    also copy delete markers and deleted cells");
    System.err
      .println(" bulkload     Write input into HFiles and bulk load to the destination table");
    System.err.println(" snapshot     Copy the data from snapshot to destination table.");
    System.err.println();
    System.err.println("Args:");
    System.err.println(" tablename    Name of the table to copy");
    System.err.println();
    System.err.println("Examples:");
    System.err
      .println(" To copy 'TestTable' to a cluster that uses replication for a 1 hour window:");
    System.err.println(" $ hbase "
      + "org.apache.hadoop.hbase.mapreduce.CopyTable --starttime=1265875194289 --endtime=1265878794289 "
      + "--peer.adr=server1,server2,server3:2181:/hbase --families=myOldCf:myNewCf,cf2,cf3 TestTable ");
    System.err.println(" To copy data from 'sourceTableSnapshot' to 'destTable': ");
    System.err.println(" $ hbase org.apache.hadoop.hbase.mapreduce.CopyTable "
      + "--snapshot --new.name=destTable sourceTableSnapshot");
    System.err.println(" To copy data from 'sourceTableSnapshot' and bulk load to 'destTable': ");
    System.err.println(" $ hbase org.apache.hadoop.hbase.mapreduce.CopyTable "
      + "--new.name=destTable --snapshot --bulkload sourceTableSnapshot");
    System.err.println("For performance consider the following general option:\n"
      + "  It is recommended that you set the following to >=100. A higher value uses more memory but\n"
      + "  decreases the round trip time to the server and may increase performance.\n"
      + "    -Dhbase.client.scanner.caching=100\n"
      + "  The following should always be set to false, to prevent writing data twice, which may produce \n"
      + "  inaccurate results.\n" + "    -Dmapreduce.map.speculative=false");
  }
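
  /*
   * For reference, a CLI invocation combining the performance flags described in the usage text
   * above with its first example (all values are the illustrative ones from that text):
   *
   *   hbase org.apache.hadoop.hbase.mapreduce.CopyTable \
   *     -Dhbase.client.scanner.caching=100 -Dmapreduce.map.speculative=false \
   *     --peer.adr=server1,server2,server3:2181:/hbase \
   *     --families=myOldCf:myNewCf,cf2,cf3 TestTable
   */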

  private boolean doCommandLine(final String[] args) {
    if (args.length < 1) {
      printUsage(null);
      return false;
    }
    try {
      for (int i = 0; i < args.length; i++) {
        String cmd = args[i];
        if (cmd.equals("-h") || cmd.startsWith("--h")) {
          printUsage(null);
          return false;
        }

        final String startRowArgKey = "--startrow=";
        if (cmd.startsWith(startRowArgKey)) {
          startRow = cmd.substring(startRowArgKey.length());
          continue;
        }

        final String stopRowArgKey = "--stoprow=";
        if (cmd.startsWith(stopRowArgKey)) {
          stopRow = cmd.substring(stopRowArgKey.length());
          continue;
        }

        final String startTimeArgKey = "--starttime=";
        if (cmd.startsWith(startTimeArgKey)) {
          startTime = Long.parseLong(cmd.substring(startTimeArgKey.length()));
          continue;
        }

        final String endTimeArgKey = "--endtime=";
        if (cmd.startsWith(endTimeArgKey)) {
          endTime = Long.parseLong(cmd.substring(endTimeArgKey.length()));
          continue;
        }

        final String batchArgKey = "--batch=";
        if (cmd.startsWith(batchArgKey)) {
          batch = Integer.parseInt(cmd.substring(batchArgKey.length()));
          continue;
        }

        final String cacheRowArgKey = "--cacheRow=";
        if (cmd.startsWith(cacheRowArgKey)) {
          cacheRow = Integer.parseInt(cmd.substring(cacheRowArgKey.length()));
          continue;
        }

        final String versionsArgKey = "--versions=";
        if (cmd.startsWith(versionsArgKey)) {
          versions = Integer.parseInt(cmd.substring(versionsArgKey.length()));
          continue;
        }

        final String newNameArgKey = "--new.name=";
        if (cmd.startsWith(newNameArgKey)) {
          dstTableName = cmd.substring(newNameArgKey.length());
          continue;
        }

        final String peerAdrArgKey = "--peer.adr=";
        if (cmd.startsWith(peerAdrArgKey)) {
          peerAddress = cmd.substring(peerAdrArgKey.length());
          continue;
        }

        final String familiesArgKey = "--families=";
        if (cmd.startsWith(familiesArgKey)) {
          families = cmd.substring(familiesArgKey.length());
          continue;
        }

        if (cmd.startsWith("--all.cells")) {
          allCells = true;
          continue;
        }

        if (cmd.startsWith("--bulkload")) {
          bulkload = true;
          continue;
        }

        if (cmd.startsWith("--shuffle")) {
          shuffle = true;
          continue;
        }

        if (cmd.startsWith("--snapshot")) {
          readingSnapshot = true;
          continue;
        }

        if (i == args.length - 1) {
          if (readingSnapshot) {
            snapshot = cmd;
          } else {
            tableName = cmd;
          }
        } else {
          printUsage("Invalid argument '" + cmd + "'");
          return false;
        }
      }
      if (dstTableName == null && peerAddress == null) {
        printUsage("At least a new table name or a peer address must be specified");
        return false;
      }
      if ((endTime != 0) && (startTime > endTime)) {
        printUsage("Invalid time range filter: starttime=" + startTime + " > endtime=" + endTime);
        return false;
      }

      if (bulkload && peerAddress != null) {
        printUsage("Remote bulkload is not supported!");
        return false;
      }

      if (readingSnapshot && peerAddress != null) {
        printUsage("Loading data from snapshot to remote peer cluster is not supported.");
        return false;
      }

      if (readingSnapshot && dstTableName == null) {
        printUsage("The --new.name=<table> for the destination table must be "
          + "provided when copying data from a snapshot.");
        return false;
      }

      if (readingSnapshot && snapshot == null) {
        printUsage("A snapshot name must be provided when --snapshot is enabled.");
        return false;
      }

      // set dstTableName if necessary
      if (dstTableName == null) {
        dstTableName = tableName;
      }
    } catch (Exception e) {
      e.printStackTrace();
      printUsage("Can't start because " + e.getMessage());
      return false;
    }
    return true;
  }

  /**
   * Main entry point.
   * @param args The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    int ret = ToolRunner.run(HBaseConfiguration.create(), new CopyTable(), args);
    System.exit(ret);
  }

  @Override
  public int run(String[] args) throws Exception {
    Job job = createSubmittableJob(args);
    if (job == null) {
      return 1;
    }
    if (!job.waitForCompletion(true)) {
      LOG.info("Map-reduce job failed!");
      if (bulkload) {
        LOG.info("Files are not bulkloaded!");
      }
      return 1;
    }
    int code = 0;
    if (bulkload) {
      LOG.info("Trying to bulk load data to destination table: " + dstTableName);
      LOG.info("command: ./bin/hbase org.apache.hadoop.hbase.tool.LoadIncrementalHFiles {} {}",
        this.bulkloadDir.toString(), this.dstTableName);
      code = new LoadIncrementalHFiles(this.getConf())
        .run(new String[] { this.bulkloadDir.toString(), this.dstTableName });
      if (code == 0) {
        // bulkloadDir is deleted only if LoadIncrementalHFiles was successful, so that one can
        // rerun LoadIncrementalHFiles otherwise.
        FileSystem fs = CommonFSUtils.getCurrentFileSystem(getConf());
        if (!fs.delete(this.bulkloadDir, true)) {
          LOG.error("Deleting folder " + bulkloadDir + " failed!");
          code = 1;
        }
      }
    }
    return code;
  }
}