/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.InetAddress;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.TreeMultimap;
import com.google.protobuf.ServiceException;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.CoordinatedStateException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.RegionLocations;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HConnectable;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.MetaScanner;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.RowMutations;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
import org.apache.hadoop.hbase.security.AccessDeniedException;
import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandler;
import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandlerImpl;
import org.apache.hadoop.hbase.util.hbck.TableLockChecker;
import org.apache.hadoop.hbase.wal.WALSplitter;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.apache.hadoop.hbase.zookeeper.ZKTableStateClientSideReader;
import org.apache.hadoop.hbase.zookeeper.ZKTableStateManager;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.zookeeper.KeeperException;

/**
 * HBaseFsck (hbck) is a tool for checking and repairing region consistency and
 * table integrity problems in a corrupted HBase.
 * <p>
 * Region consistency checks verify that hbase:meta, region deployment on live
 * region servers, and the state of data in HDFS (.regioninfo files) all agree.
 * Table integrity checks verify that all possible row keys resolve to exactly
 * one region of a table.  This means there are no individual degenerate or
 * backwards regions, no holes between regions, and no overlapping regions.
 * <p>
 * The general repair strategy works in two phases:
 * <ol>
 *   <li> Repair table integrity on HDFS (merge or fabricate regions).
 *   <li> Repair region consistency with hbase:meta and assignments.
 * </ol>
 * <p>
 * For table integrity repairs, the table's region directories are scanned for
 * .regioninfo files.  Each table's integrity is then verified.  If there are
 * any orphan regions (regions with no .regioninfo files), overlapping regions,
 * or holes, new regions are fabricated, sidelined, or merged as needed so that
 * the table becomes whole again.
 * <p>
 * Region consistency requires three conditions: each region must have a valid
 * row in hbase:meta, must be deployed on exactly the region server that
 * hbase:meta points to, and must have matching data in HDFS.
 * <p>
 * By default, hbck runs in a read-only, inconsistency-identifying mode; repairs
 * are only attempted when the relevant -fix options are specified.
 */
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
@InterfaceStability.Evolving
public class HBaseFsck extends Configured implements Closeable {
  public static final long DEFAULT_TIME_LAG = 60000; // default value of 1 minute
  public static final long DEFAULT_SLEEP_BEFORE_RERUN = 10000;
  private static final int MAX_NUM_THREADS = 50; // #threads to contact regions
  private static boolean rsSupportsOffline = true;
  private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
  private static final int DEFAULT_MAX_MERGE = 5;
  private static final String TO_BE_LOADED = "to_be_loaded";
  private static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
  private static final int DEFAULT_MAX_LOCK_FILE_ATTEMPTS = 5;
  private static final int DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL = 200; // milliseconds

  /**********************
   * Internal resources
   **********************/
  private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
  private ClusterStatus status;
  private ClusterConnection connection;
  private Admin admin;
  private Table meta;
  // thread pool for parallelizable tasks (contacting region servers, scanning HDFS dirs)
  protected ExecutorService executor;
  private long startMillis = System.currentTimeMillis();
  private HFileCorruptionChecker hfcc;
  private int retcode = 0;
  private Path HBCK_LOCK_PATH;
  private FSDataOutputStream hbckOutFd;
  // Set to true once the hbck lock file has been created, so the shutdown hook
  // knows whether the lock needs to be cleaned up on exit.
  private final AtomicBoolean hbckLockCleanup = new AtomicBoolean(false);

  /***********
   * Options
   ***********/
  private static boolean details = false; // do we display the full report
  private long timelag = DEFAULT_TIME_LAG; // ignore tables modified more recently than this lag
  private boolean fixAssignments = false; // fix assignment errors?
  private boolean fixMeta = false; // fix meta errors?
  private boolean checkHdfs = true; // load and check fs consistency?
  private boolean fixHdfsHoles = false; // fix fs holes?
  private boolean fixHdfsOverlaps = false; // fix fs overlaps (risky)
  private boolean fixHdfsOrphans = false; // fix fs holes (missing .regioninfo)
  private boolean fixTableOrphans = false; // fix fs holes (missing .tableinfo)
  private boolean fixVersionFile = false; // fix missing hbase.version file in hdfs
  private boolean fixSplitParents = false; // fix lingering split parents
  private boolean fixReferenceFiles = false; // fix lingering reference store files
  private boolean fixEmptyMetaCells = false; // fix (remove) empty REGIONINFO_QUALIFIER rows
  private boolean fixTableLocks = false; // fix table locks which are expired
  private boolean fixTableZNodes = false; // fix table znodes which are orphaned
  private boolean fixAny = false; // set to true if any of the fix options is enabled

  // limit checking/fixes to listed tables, if empty attempt to check/fix all
  // hbase:meta is always checked
  private Set<TableName> tablesIncluded = new HashSet<TableName>();
  private int maxMerge = DEFAULT_MAX_MERGE; // maximum number of overlapping regions to merge
  private int maxOverlapsToSideline = DEFAULT_OVERLAPS_TO_SIDELINE;
  private boolean sidelineBigOverlaps = false; // sideline overlaps with > maxMerge regions
  private Path sidelineDir = null;

  private boolean rerun = false; // if we tried to fix something, rerun hbck
  private static boolean summary = false; // if we want to print less output
  private boolean checkMetaOnly = false;
  private boolean checkRegionBoundaries = false;
  private boolean ignorePreCheckPermission = false; // skip the filesystem permission pre-check

  /*********
   * State
   *********/
  final private ErrorReporter errors;
  int fixes = 0;

  /**
   * This map contains the state of all hbck items.  It maps from encoded region
   * name to HbckInfo structure.  The information contained in HbckInfo is used
   * to detect and correct consistency (hdfs/meta/deployment) problems.
   */
  private TreeMap<String, HbckInfo> regionInfoMap = new TreeMap<String, HbckInfo>();
  private TreeSet<TableName> disabledTables =
    new TreeSet<TableName>();
  // Empty regioninfo qualifiers in hbase:meta
  private Set<Result> emptyRegionInfoQualifiers = new HashSet<Result>();

  /**
   * This map from Tablename -> TableInfo contains the structures necessary to
   * detect table consistency problems (holes, dupes, overlaps).  It is sorted
   * to prevent dupes.
   *
   * If tablesIncluded is empty, this map contains all tables.
   * Otherwise, it contains only meta tables and tables in tablesIncluded.
   */
  private SortedMap<TableName, TableInfo> tablesInfo =
      new ConcurrentSkipListMap<TableName, TableInfo>();

  /**
   * When initially looking at HDFS, we attempt to find any orphaned data.
   */
  private List<HbckInfo> orphanHdfsDirs = Collections.synchronizedList(new ArrayList<HbckInfo>());

  private Map<TableName, Set<String>> orphanTableDirs =
      new HashMap<TableName, Set<String>>();

  /**
   * List of orphaned table ZNodes
   */
  private Set<TableName> orphanedTableZNodes = new HashSet<TableName>();
  private final RetryCounterFactory lockFileRetryCounterFactory;

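  /**
   * Constructor
   *
   * @param conf Configuration object
   * @throws MasterNotRunningException if the master is not running
   * @throws ZooKeeperConnectionException if unable to connect to ZooKeeper
   */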
  public HBaseFsck(Configuration conf) throws MasterNotRunningException,
      ZooKeeperConnectionException, IOException, ClassNotFoundException {
    super(conf);
    // make a copy, just to be sure we're not overriding someone else's config
    setConf(HBaseConfiguration.create(getConf()));
    // disable blockcache for tool invocation
    getConf().setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0);
    // Disable usage of meta replicas in hbck
    getConf().setBoolean(HConstants.USE_META_REPLICAS, false);
    errors = getErrorReporter(conf);

    int numThreads = conf.getInt("hbasefsck.numthreads", MAX_NUM_THREADS);
    executor = new ScheduledThreadPoolExecutor(numThreads,
        Threads.newDaemonThreadFactory("hbasefsck"));
    lockFileRetryCounterFactory = new RetryCounterFactory(
        getConf().getInt("hbase.hbck.lockfile.attempts", DEFAULT_MAX_LOCK_FILE_ATTEMPTS),
        getConf().getInt("hbase.hbck.lockfile.attempt.sleep.interval",
            DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL));
  }

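  /**
   * Constructor
   *
   * @param conf Configuration object
   * @param exec executor service used for parallelizable tasks
   * @throws MasterNotRunningException if the master is not running
   * @throws ZooKeeperConnectionException if unable to connect to ZooKeeper
   */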
  public HBaseFsck(Configuration conf, ExecutorService exec) throws MasterNotRunningException,
      ZooKeeperConnectionException, IOException, ClassNotFoundException {
    super(conf);
    errors = getErrorReporter(getConf());
    this.executor = exec;
    lockFileRetryCounterFactory = new RetryCounterFactory(
        getConf().getInt("hbase.hbck.lockfile.attempts", DEFAULT_MAX_LOCK_FILE_ATTEMPTS),
        getConf().getInt("hbase.hbck.lockfile.attempt.sleep.interval",
            DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL));
  }

  private class FileLockCallable implements Callable<FSDataOutputStream> {
    RetryCounter retryCounter;

    public FileLockCallable(RetryCounter retryCounter) {
      this.retryCounter = retryCounter;
    }

    @Override
    public FSDataOutputStream call() throws IOException {
      try {
        FileSystem fs = FSUtils.getCurrentFileSystem(getConf());
        FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(),
            HConstants.DATA_FILE_UMASK_KEY);
        Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY);
        fs.mkdirs(tmpDir);
        HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE);
        final FSDataOutputStream out = createFileWithRetries(fs, HBCK_LOCK_PATH, defaultPerms);
        out.writeBytes(InetAddress.getLocalHost().toString());
        out.flush();
        return out;
      } catch (RemoteException e) {
        if (AlreadyBeingCreatedException.class.getName().equals(e.getClassName())) {
          return null;
        } else {
          throw e;
        }
      }
    }

    private FSDataOutputStream createFileWithRetries(final FileSystem fs,
        final Path hbckLockFilePath, final FsPermission defaultPerms)
        throws IOException {

      IOException exception = null;
      do {
        try {
          return FSUtils.create(fs, hbckLockFilePath, defaultPerms, false);
        } catch (IOException ioe) {
          LOG.info("Failed to create lock file " + hbckLockFilePath.getName()
              + ", try=" + (retryCounter.getAttemptTimes() + 1) + " of "
              + retryCounter.getMaxAttempts());
          LOG.debug("Failed to create lock file " + hbckLockFilePath.getName(),
              ioe);
          try {
            exception = ioe;
            retryCounter.sleepUntilNextRetry();
          } catch (InterruptedException ie) {
            throw (InterruptedIOException) new InterruptedIOException(
                "Can't create lock file " + hbckLockFilePath.getName())
                .initCause(ie);
          }
        }
      } while (retryCounter.shouldRetry());

      throw exception;
    }
  }

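  /**
   * This method maintains a lock using a file. If the creation fails we return null.
   *
   * @return FSDataOutputStream object corresponding to the newly opened lock file
   * @throws IOException if IO failure occurs
   */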
  private FSDataOutputStream checkAndMarkRunningHbck() throws IOException {
    RetryCounter retryCounter = lockFileRetryCounterFactory.create();
    FileLockCallable callable = new FileLockCallable(retryCounter);
    ExecutorService executor = Executors.newFixedThreadPool(1);
    FutureTask<FSDataOutputStream> futureTask = new FutureTask<FSDataOutputStream>(callable);
    executor.execute(futureTask);
    final int timeoutInSeconds = 30;
    FSDataOutputStream stream = null;
    try {
      stream = futureTask.get(timeoutInSeconds, TimeUnit.SECONDS);
    } catch (ExecutionException ee) {
      LOG.warn("Encountered exception when opening lock file", ee);
    } catch (InterruptedException ie) {
      LOG.warn("Interrupted when opening lock file", ie);
      Thread.currentThread().interrupt();
    } catch (TimeoutException exception) {
      // took too long to obtain lock
      LOG.warn("Took more than " + timeoutInSeconds + " seconds in obtaining lock");
      futureTask.cancel(true);
    } finally {
      executor.shutdownNow();
    }
    return stream;
  }

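  /**
   * Delete the hbck lock file, retrying on failure. Invoked when hbck finishes
   * and from the shutdown hook.
   */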
  private void unlockHbck() {
    if (hbckLockCleanup.compareAndSet(true, false)) {
      RetryCounter retryCounter = lockFileRetryCounterFactory.create();
      do {
        try {
          IOUtils.closeStream(hbckOutFd);
          FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()),
              HBCK_LOCK_PATH, true);
          return;
        } catch (IOException ioe) {
          LOG.info("Failed to delete " + HBCK_LOCK_PATH + ", try="
              + (retryCounter.getAttemptTimes() + 1) + " of "
              + retryCounter.getMaxAttempts());
          LOG.debug("Failed to delete " + HBCK_LOCK_PATH, ioe);
          try {
            retryCounter.sleepUntilNextRetry();
          } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
            LOG.warn("Interrupted while deleting lock file " +
                HBCK_LOCK_PATH);
            return;
          }
        }
      } while (retryCounter.shouldRetry());
    }
  }

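  /**
   * To repair region consistency, one must call connect() in order to repair
   * online state.
   */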
  public void connect() throws IOException {

    // Check if another instance of hbck is running
    hbckOutFd = checkAndMarkRunningHbck();
    if (hbckOutFd == null) {
      setRetCode(-1);
      LOG.error("Another instance of hbck is running, exiting this instance. [If you are sure" +
          " no other instance is running, delete the lock file " +
          HBCK_LOCK_PATH + " and rerun the tool]");
      throw new IOException("Duplicate hbck - Abort");
    }

    // Make sure to cleanup the lock
    hbckLockCleanup.set(true);

    // Add a shutdown hook, in case the user tries to kill hbck with a ctrl-c;
    // we want to clean up the lock so that it is available for further calls.
    Runtime.getRuntime().addShutdownHook(new Thread() {
      @Override
      public void run() {
        IOUtils.closeStream(HBaseFsck.this);
        unlockHbck();
      }
    });
    LOG.debug("Launching hbck");

    connection = (ClusterConnection) ConnectionFactory.createConnection(getConf());
    admin = connection.getAdmin();
    meta = connection.getTable(TableName.META_TABLE_NAME);
    status = admin.getClusterStatus();
  }

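  /**
   * Get deployed regions according to the region servers.
   */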
  private void loadDeployedRegions() throws IOException, InterruptedException {
    // From the master, get a list of all known live region servers
    Collection<ServerName> regionServers = status.getServers();
    errors.print("Number of live region servers: " + regionServers.size());
    if (details) {
      for (ServerName rsinfo: regionServers) {
        errors.print("  " + rsinfo.getServerName());
      }
    }

    // From the master, get a list of all dead region servers
    Collection<ServerName> deadRegionServers = status.getDeadServerNames();
    errors.print("Number of dead region servers: " + deadRegionServers.size());
    if (details) {
      for (ServerName name: deadRegionServers) {
        errors.print("  " + name);
      }
    }

    // Print the current master name and state
    errors.print("Master: " + status.getMaster());

    // Print the list of all backup masters
    Collection<ServerName> backupMasters = status.getBackupMasters();
    errors.print("Number of backup masters: " + backupMasters.size());
    if (details) {
      for (ServerName name: backupMasters) {
        errors.print("  " + name);
      }
    }

    errors.print("Average load: " + status.getAverageLoad());
    errors.print("Number of requests: " + status.getRequestsCount());
    errors.print("Number of regions: " + status.getRegionsCount());

    Map<String, RegionState> rits = status.getRegionsInTransition();
    errors.print("Number of regions in transition: " + rits.size());
    if (details) {
      for (RegionState state: rits.values()) {
        errors.print("  " + state.toDescriptiveString());
      }
    }

    // Determine what's deployed
    processRegionServers(regionServers);
  }

  /**
   * Clear the current state of hbck.
   */
  private void clearState() {
    // Make sure regionInfo is empty before starting
    fixes = 0;
    regionInfoMap.clear();
    emptyRegionInfoQualifiers.clear();
    disabledTables.clear();
    errors.clear();
    tablesInfo.clear();
    orphanHdfsDirs.clear();
  }

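  /**
   * This repair method analyzes hbase data in hdfs and repairs it to satisfy
   * the table integrity rules.  HBase doesn't need to be online for this
   * operation to work.
   */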
  public void offlineHdfsIntegrityRepair() throws IOException, InterruptedException {
    // Initial pass to fix orphans.
    if (shouldCheckHdfs() && (shouldFixHdfsOrphans() || shouldFixHdfsHoles()
        || shouldFixHdfsOverlaps() || shouldFixTableOrphans())) {
      LOG.info("Loading regioninfos from HDFS");
      // prefetch data to be used in processing later
      int maxIterations = getConf().getInt("hbase.hbck.integrityrepair.iterations.max", 3);
      int curIter = 0;
      do {
        clearState(); // clears hbck state and resets fixes to 0
        // repair what's on HDFS
        restoreHdfsIntegrity();
        curIter++;
      } while (fixes > 0 && curIter <= maxIterations);

      // Repairs should be done in the first iteration and verification in the second.
      // If there are more than 2 passes, something funny has happened.
      if (curIter > 2) {
        if (curIter == maxIterations) {
          LOG.warn("Exiting integrity repairs after max " + curIter + " iterations. "
              + "Table integrity may not be fully repaired!");
        } else {
          LOG.info("Successfully exiting integrity repairs after " + curIter + " iterations");
        }
      }
    }
  }

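  /**
   * This repair method requires the cluster to be online since it contacts
   * region servers and the masters.  It makes each region's state in HDFS, in
   * hbase:meta, and deployments consistent.
   *
   * @return If &gt; 0, the number of errors detected, otherwise 0 or less means success.
   */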
  public int onlineConsistencyRepair() throws IOException, KeeperException,
      InterruptedException {
    clearState();

    // get regions according to what is online on each RegionServer
    loadDeployedRegions();
    // check whether hbase:meta is deployed and online
    recordMetaRegion();
    // Check if hbase:meta is found only once and in the right place
    if (!checkMetaRegion()) {
      String errorMsg = "hbase:meta table is not consistent. ";
      if (shouldFixAssignments()) {
        errorMsg += "HBCK will try fixing it. Rerun once hbase:meta is back to consistent state.";
      } else {
        errorMsg += "Run HBCK with proper fix options to fix hbase:meta inconsistency.";
      }
      errors.reportError(errorMsg + " Exiting...");
      return -2;
    }
    // Not going further with consistency checks for tables when hbase:meta itself is not consistent.
    LOG.info("Loading regioninfos from the hbase:meta table");
    boolean success = loadMetaEntries();
    if (!success) return -1;

    // Empty cells in hbase:meta?
    reportEmptyMetaCells();

    // Check if we have to cleanup empty REGIONINFO_QUALIFIER rows from hbase:meta
    if (shouldFixEmptyMetaCells()) {
      fixEmptyMetaCells();
    }

    // get a list of all tables that have not changed recently.
    if (!checkMetaOnly) {
      reportTablesInFlux();
    }

    // load regiondirs and regioninfos from HDFS
    if (shouldCheckHdfs()) {
      LOG.info("Loading region directories from HDFS");
      loadHdfsRegionDirs();
      LOG.info("Loading region information from HDFS");
      loadHdfsRegionInfos();
    }

    // Get disabled tables from ZooKeeper
    loadDisabledTables();

    // fix the orphan tables
    fixOrphanTables();

    LOG.info("Checking and fixing region consistency");
    // Check and fix consistency
    checkAndFixConsistency();

    // Check integrity (does not fix)
    checkIntegrity();
    return errors.getErrorList().size();
  }

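  /**
   * Contacts the master and prints out cluster-wide information
   * @return 0 on success, non-zero on failure
   */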
  public int onlineHbck()
      throws IOException, KeeperException, InterruptedException, ServiceException {
    // print hbase server version
    errors.print("Version: " + status.getHBaseVersion());
    offlineHdfsIntegrityRepair();

    // turn the balancer off while repairing
    boolean oldBalancer = admin.setBalancerRunning(false, true);
    try {
      onlineConsistencyRepair();
    }
    finally {
      admin.setBalancerRunning(oldBalancer, false);
    }

    if (checkRegionBoundaries) {
      checkRegionBoundaries();
    }

    offlineReferenceFileRepair();

    checkAndFixTableLocks();

    // Check (only) and report about orphaned table ZNodes
    checkAndFixOrphanedTableZNodes();

    // Remove the hbck lock
    unlockHbck();

    // Print table summary
    printTableSummary(tablesInfo);
    return errors.summarize();
  }

  public static byte[] keyOnly(byte[] b) {
    if (b == null)
      return b;
    int rowlength = Bytes.toShort(b, 0);
    byte[] result = new byte[rowlength];
    System.arraycopy(b, Bytes.SIZEOF_SHORT, result, 0, rowlength);
    return result;
  }

  @Override
  public void close() throws IOException {
    IOUtils.cleanup(null, admin, meta, connection);
  }

  private static class RegionBoundariesInformation {
    public byte [] regionName;
    public byte [] metaFirstKey;
    public byte [] metaLastKey;
    public byte [] storesFirstKey;
    public byte [] storesLastKey;
    @Override
    public String toString () {
      return "regionName=" + Bytes.toStringBinary(regionName) +
          "\nmetaFirstKey=" + Bytes.toStringBinary(metaFirstKey) +
          "\nmetaLastKey=" + Bytes.toStringBinary(metaLastKey) +
          "\nstoresFirstKey=" + Bytes.toStringBinary(storesFirstKey) +
          "\nstoresLastKey=" + Bytes.toStringBinary(storesLastKey);
    }
  }

  public void checkRegionBoundaries() {
    try {
      ByteArrayComparator comparator = new ByteArrayComparator();
      List<HRegionInfo> regions = MetaScanner.listAllRegions(getConf(), connection, false);
      final RegionBoundariesInformation currentRegionBoundariesInformation =
          new RegionBoundariesInformation();
      Path hbaseRoot = FSUtils.getRootDir(getConf());
      for (HRegionInfo regionInfo : regions) {
        Path tableDir = FSUtils.getTableDir(hbaseRoot, regionInfo.getTable());
        currentRegionBoundariesInformation.regionName = regionInfo.getRegionName();
        // For each region, get the start and stop key from META and compare them to the
        // same information from the store files.
        Path path = new Path(tableDir, regionInfo.getEncodedName());
        FileSystem fs = path.getFileSystem(getConf());
        FileStatus[] files = fs.listStatus(path);
        // For all the column families in this region...
        byte[] storeFirstKey = null;
        byte[] storeLastKey = null;
        for (FileStatus file : files) {
          String fileName = file.getPath().toString();
          fileName = fileName.substring(fileName.lastIndexOf("/") + 1);
          if (!fileName.startsWith(".") && !fileName.endsWith("recovered.edits")) {
            FileStatus[] storeFiles = fs.listStatus(file.getPath());
            // For all the store files in this column family...
            for (FileStatus storeFile : storeFiles) {
              HFile.Reader reader = HFile.createReader(fs, storeFile.getPath(), new CacheConfig(
                  getConf()), getConf());
              if ((reader.getFirstKey() != null)
                  && ((storeFirstKey == null) || (comparator.compare(storeFirstKey,
                      reader.getFirstKey()) > 0))) {
                storeFirstKey = reader.getFirstKey();
              }
              if ((reader.getLastKey() != null)
                  && ((storeLastKey == null) || (comparator.compare(storeLastKey,
                      reader.getLastKey())) < 0)) {
                storeLastKey = reader.getLastKey();
              }
              reader.close();
            }
          }
        }
        currentRegionBoundariesInformation.metaFirstKey = regionInfo.getStartKey();
        currentRegionBoundariesInformation.metaLastKey = regionInfo.getEndKey();
        currentRegionBoundariesInformation.storesFirstKey = keyOnly(storeFirstKey);
        currentRegionBoundariesInformation.storesLastKey = keyOnly(storeLastKey);
        if (currentRegionBoundariesInformation.metaFirstKey.length == 0)
          currentRegionBoundariesInformation.metaFirstKey = null;
        if (currentRegionBoundariesInformation.metaLastKey.length == 0)
          currentRegionBoundariesInformation.metaLastKey = null;

        // For a region to be correct, the store keys must fall inside the
        // boundaries recorded in hbase:meta:
        //   storesFirstKey >= metaFirstKey
        //   storesLastKey  <  metaLastKey
        // A null key on either side skips that comparison.
        boolean valid = true;
        // Checking start key.
        if ((currentRegionBoundariesInformation.storesFirstKey != null)
            && (currentRegionBoundariesInformation.metaFirstKey != null)) {
          valid = valid
              && comparator.compare(currentRegionBoundariesInformation.storesFirstKey,
                  currentRegionBoundariesInformation.metaFirstKey) >= 0;
        }
        // Checking stop key.
        if ((currentRegionBoundariesInformation.storesLastKey != null)
            && (currentRegionBoundariesInformation.metaLastKey != null)) {
          valid = valid
              && comparator.compare(currentRegionBoundariesInformation.storesLastKey,
                  currentRegionBoundariesInformation.metaLastKey) < 0;
        }
        if (!valid) {
          errors.reportError(ERROR_CODE.BOUNDARIES_ERROR, "Found issues with regions boundaries",
              tablesInfo.get(regionInfo.getTable()));
          LOG.warn("Region's boundaries not aligned between stores and META for:");
          LOG.warn(currentRegionBoundariesInformation);
        }
      }
    } catch (IOException e) {
      LOG.error(e);
    }
  }

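  /**
   * Iterates through the list of all orphan/invalid regiondirs.
   */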
  private void adoptHdfsOrphans(Collection<HbckInfo> orphanHdfsDirs) throws IOException {
    for (HbckInfo hi : orphanHdfsDirs) {
      LOG.info("Attempting to handle orphan hdfs dir: " + hi.getHdfsRegionDir());
      adoptHdfsOrphan(hi);
    }
  }

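  /**
   * Orphaned regions are regions without a .regioninfo file in them.  We "adopt"
   * these orphans by creating a new region and moving the column families,
   * recovered edits, and WALs into the new region dir.  We determine the region
   * start and end keys by looking at all of the hfiles inside the column
   * families to identify the min and max keys.  The resulting region will
   * likely violate table integrity but will be dealt with by merging
   * overlapping regions.
   */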
  @SuppressWarnings("deprecation")
  private void adoptHdfsOrphan(HbckInfo hi) throws IOException {
    Path p = hi.getHdfsRegionDir();
    FileSystem fs = p.getFileSystem(getConf());
    FileStatus[] dirs = fs.listStatus(p);
    if (dirs == null) {
      LOG.warn("Attempt to adopt orphan hdfs region skipped because no files present in " +
          p + ". This dir could probably be deleted.");
      return;
    }

    TableName tableName = hi.getTableName();
    TableInfo tableInfo = tablesInfo.get(tableName);
    Preconditions.checkNotNull(tableInfo, "Table '" + tableName + "' not present!");
    HTableDescriptor template = tableInfo.getHTD();

    // find min and max key values
    Pair<byte[], byte[]> orphanRegionRange = null;
    for (FileStatus cf : dirs) {
      String cfName = cf.getPath().getName();
      // skip special dirs (e.g. the split log dir)
      if (cfName.startsWith(".") || cfName.equals(HConstants.SPLIT_LOGDIR_NAME)) continue;

      FileStatus[] hfiles = fs.listStatus(cf.getPath());
      for (FileStatus hfile : hfiles) {
        byte[] start, end;
        HFile.Reader hf = null;
        try {
          CacheConfig cacheConf = new CacheConfig(getConf());
          hf = HFile.createReader(fs, hfile.getPath(), cacheConf, getConf());
          hf.loadFileInfo();
          KeyValue startKv = KeyValue.createKeyValueFromKey(hf.getFirstKey());
          start = startKv.getRow();
          KeyValue endKv = KeyValue.createKeyValueFromKey(hf.getLastKey());
          end = endKv.getRow();
        } catch (IOException ioe) {
          LOG.warn("Problem reading orphan file " + hfile + ", skipping");
          continue;
        } catch (NullPointerException ioe) {
          LOG.warn("Orphan file " + hfile + " is possibly corrupted HFile, skipping");
          continue;
        } finally {
          if (hf != null) {
            hf.close();
          }
        }

        // expand the range to include the range of all hfiles
        if (orphanRegionRange == null) {
          // first range
          orphanRegionRange = new Pair<byte[], byte[]>(start, end);
        } else {
          // expand the range only if the hfile is wider.
          if (Bytes.compareTo(orphanRegionRange.getFirst(), start) > 0) {
            orphanRegionRange.setFirst(start);
          }
          if (Bytes.compareTo(orphanRegionRange.getSecond(), end) < 0) {
            orphanRegionRange.setSecond(end);
          }
        }
      }
    }
    if (orphanRegionRange == null) {
      LOG.warn("No data in dir " + p + ", sidelining data");
      fixes++;
      sidelineRegionDir(fs, hi);
      return;
    }
    LOG.info("Min max keys are : [" + Bytes.toString(orphanRegionRange.getFirst()) + ", " +
        Bytes.toString(orphanRegionRange.getSecond()) + ")");

    // create new region on hdfs. move data into place.
    HRegionInfo hri = new HRegionInfo(template.getTableName(), orphanRegionRange.getFirst(),
        orphanRegionRange.getSecond());
    LOG.info("Creating new region : " + hri);
    HRegion region = HBaseFsckRepair.createHDFSRegionDir(getConf(), hri, template);
    Path target = region.getRegionFileSystem().getRegionDir();

    // rename all the data to the new region
    mergeRegionDirs(target, hi);
    fixes++;
  }

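  /**
   * This method determines if there are table integrity errors in HDFS.  If
   * there are errors and the appropriate "fix" options are enabled, the method
   * will first fix orphan regions making them into legit regiondirs, and then
   * reload to merge potentially overlapping regions.
   *
   * @return number of table integrity errors found
   */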
  private int restoreHdfsIntegrity() throws IOException, InterruptedException {
    // Determine what's on HDFS
    LOG.info("Loading HBase regioninfo from HDFS...");
    loadHdfsRegionDirs(); // populating regioninfo table.

    int errs = errors.getErrorList().size();
    // First time just get suggestions.
    tablesInfo = loadHdfsRegionInfos(); // update tableInfos based on region info in fs.
    checkHdfsIntegrity(false, false);

    if (errors.getErrorList().size() == errs) {
      LOG.info("No integrity errors.  We are done with this phase. Glorious.");
      return 0;
    }

    if (shouldFixHdfsOrphans() && orphanHdfsDirs.size() > 0) {
      adoptHdfsOrphans(orphanHdfsDirs);
    }

    // Make sure there are no holes now.
    if (shouldFixHdfsHoles()) {
      clearState(); // this also resets # fixes.
      loadHdfsRegionDirs();
      tablesInfo = loadHdfsRegionInfos();
      tablesInfo = checkHdfsIntegrity(shouldFixHdfsHoles(), false);
    }

    // Now we fix overlaps
    if (shouldFixHdfsOverlaps()) {
      // second pass we fix overlaps.
      clearState(); // this also resets # fixes.
      loadHdfsRegionDirs();
      tablesInfo = loadHdfsRegionInfos();
      tablesInfo = checkHdfsIntegrity(false, shouldFixHdfsOverlaps());
    }

    return errors.getErrorList().size();
  }

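  /**
   * Scan all the store file names to find any lingering reference files,
   * which refer to non-existing files.  If the "fix" option is enabled, any
   * lingering reference file found will be sidelined.
   * <p>
   * A lingering reference file prevents a region from opening.  It has to
   * be fixed before a cluster can start properly.
   */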
  private void offlineReferenceFileRepair() throws IOException {
    Configuration conf = getConf();
    Path hbaseRoot = FSUtils.getRootDir(conf);
    FileSystem fs = hbaseRoot.getFileSystem(conf);
    LOG.info("Computing mapping of all store files");
    Map<String, Path> allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot, errors);
    errors.print("");
    LOG.info("Validating mapping using HDFS state");
    for (Path path: allFiles.values()) {
      boolean isReference = false;
      try {
        isReference = StoreFileInfo.isReference(path);
      } catch (Throwable t) {
        // Ignore. Some files may not be store files at all.
        // For example, files under the .oldlogs folder in hbase:meta.
        // A warning message is already logged by StoreFileInfo#isReference.
      }
      if (!isReference) continue;

      Path referredToFile = StoreFileInfo.getReferredToFile(path);
      if (fs.exists(referredToFile)) continue;  // good, expected

      // Found a lingering reference file
      errors.reportError(ERROR_CODE.LINGERING_REFERENCE_HFILE,
          "Found lingering reference file " + path);
      if (!shouldFixReferenceFiles()) continue;

      // Now, trying to fix it since requested
      boolean success = false;
      String pathStr = path.toString();

      // A reference file path should look like
      // ${hbase.rootdir}/data/namespace/table_name/region_id/family_name/referred_file.region_name
      // Walk up 5 directories to get the root folder, so the file is sidelined
      // into a similar folder structure.
      int index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR);
      for (int i = 0; index > 0 && i < 5; i++) {
        index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR, index - 1);
      }
      if (index > 0) {
        Path rootDir = getSidelineDir();
        Path dst = new Path(rootDir, pathStr.substring(index + 1));
        fs.mkdirs(dst.getParent());
        LOG.info("Trying to sideline reference file "
            + path + " to " + dst);
        setShouldRerun();

        success = fs.rename(path, dst);
      }
      if (!success) {
        LOG.error("Failed to sideline reference file " + path);
      }
    }
  }

  /**
   * Report the number of empty REGIONINFO_QUALIFIER rows found in hbase:meta.
   */
  private void reportEmptyMetaCells() {
    errors.print("Number of empty REGIONINFO_QUALIFIER rows in hbase:meta: " +
        emptyRegionInfoQualifiers.size());
    if (details) {
      for (Result r: emptyRegionInfoQualifiers) {
        errors.print("  " + r);
      }
    }
  }

  /**
   * Print a summary of the tables found, noting how many are in flux (recently modified).
   */
  private void reportTablesInFlux() {
    AtomicInteger numSkipped = new AtomicInteger(0);
    HTableDescriptor[] allTables = getTables(numSkipped);
    errors.print("Number of Tables: " + allTables.length);
    if (details) {
      if (numSkipped.get() > 0) {
        errors.detail("Number of Tables in flux: " + numSkipped.get());
      }
      for (HTableDescriptor td : allTables) {
        errors.detail("  Table: " + td.getTableName() + "\t" +
            (td.isReadOnly() ? "ro" : "rw") + "\t" +
            (td.isMetaRegion() ? "META" : " ") + "\t" +
            " families: " + td.getFamilies().size());
      }
    }
  }

  public ErrorReporter getErrors() {
    return errors;
  }

  /**
   * Read the .regioninfo file for a region from the file system and cache it
   * in the corresponding HbckInfo.
   */
  private void loadHdfsRegioninfo(HbckInfo hbi) throws IOException {
    Path regionDir = hbi.getHdfsRegionDir();
    if (regionDir == null) {
      LOG.warn("No HDFS region dir found: " + hbi + " meta=" + hbi.metaEntry);
      return;
    }

    if (hbi.hdfsEntry.hri != null) {
      // already loaded data
      return;
    }

    FileSystem fs = FileSystem.get(getConf());
    HRegionInfo hri = HRegionFileSystem.loadRegionInfoFileContent(fs, regionDir);
    LOG.debug("HRegionInfo read: " + hri.toString());
    hbi.hdfsEntry.hri = hri;
  }

  /**
   * Exception thrown when a region repair fails; wraps the underlying IOException.
   */
  public static class RegionRepairException extends IOException {
    private static final long serialVersionUID = 1L;
    final IOException ioe;
    public RegionRepairException(String s, IOException ioe) {
      super(s);
      this.ioe = ioe;
    }
  }

  /**
   * Populate hbck info structures from region infos loaded from the file system.
   */
  private SortedMap<TableName, TableInfo> loadHdfsRegionInfos()
      throws IOException, InterruptedException {
    tablesInfo.clear(); // regenerating the data
    // generate region split structure
    Collection<HbckInfo> hbckInfos = regionInfoMap.values();

    // Parallelized read of .regioninfo files.
    List<WorkItemHdfsRegionInfo> hbis = new ArrayList<WorkItemHdfsRegionInfo>(hbckInfos.size());
    List<Future<Void>> hbiFutures;

    for (HbckInfo hbi : hbckInfos) {
      WorkItemHdfsRegionInfo work = new WorkItemHdfsRegionInfo(hbi, this, errors);
      hbis.add(work);
    }

    // Submit and wait for completion
    hbiFutures = executor.invokeAll(hbis);

    for (int i = 0; i < hbiFutures.size(); i++) {
      WorkItemHdfsRegionInfo work = hbis.get(i);
      Future<Void> f = hbiFutures.get(i);
      try {
        f.get();
      } catch (ExecutionException e) {
        LOG.warn("Failed to read .regioninfo file for region " +
            work.hbi.getRegionNameAsString(), e.getCause());
      }
    }

    Path hbaseRoot = FSUtils.getRootDir(getConf());
    FileSystem fs = hbaseRoot.getFileSystem(getConf());
    // serialized table info gathering.
    for (HbckInfo hbi: hbckInfos) {

      if (hbi.getHdfsHRI() == null) {
        // was an orphan
        continue;
      }

      // get table name from hdfs, populate various HBaseFsck tables.
      TableName tableName = hbi.getTableName();
      if (tableName == null) {
        // There was an entry in hbase:meta not in the HDFS?
        LOG.warn("tableName was null for: " + hbi);
        continue;
      }

      TableInfo modTInfo = tablesInfo.get(tableName);
      if (modTInfo == null) {
        // only executed once per table.
        modTInfo = new TableInfo(tableName);
        tablesInfo.put(tableName, modTInfo);
        try {
          HTableDescriptor htd =
              FSTableDescriptors.getTableDescriptorFromFs(fs, hbaseRoot, tableName);
          modTInfo.htds.add(htd);
        } catch (IOException ioe) {
          if (!orphanTableDirs.containsKey(tableName)) {
            LOG.warn("Unable to read .tableinfo from " + hbaseRoot, ioe);
            // should only report once for each table
            errors.reportError(ERROR_CODE.NO_TABLEINFO_FILE,
                "Unable to read .tableinfo from " + hbaseRoot + "/" + tableName);
            Set<String> columns = new HashSet<String>();
            orphanTableDirs.put(tableName, getColumnFamilyList(columns, hbi));
          }
        }
      }
      if (!hbi.isSkipChecks()) {
        modTInfo.addRegionInfo(hbi);
      }
    }

    loadTableInfosForTablesWithNoRegion();
    errors.print("");

    return tablesInfo;
  }

  /**
   * Derive the column family list from the column family dirs of a region.
   *
   * @param columns set to populate with column family names
   * @param hbi the region whose HDFS dir is scanned
   * @return the populated set of column families
   */
  private Set<String> getColumnFamilyList(Set<String> columns, HbckInfo hbi) throws IOException {
    Path regionDir = hbi.getHdfsRegionDir();
    FileSystem fs = regionDir.getFileSystem(getConf());
    FileStatus[] subDirs = fs.listStatus(regionDir, new FSUtils.FamilyDirFilter(fs));
    for (FileStatus subdir : subDirs) {
      String columnfamily = subdir.getPath().getName();
      columns.add(columnfamily);
    }
    return columns;
  }

  /**
   * Fabricate a .tableinfo file with the correct table name, the correct column
   * family list, and default properties for both {@link HTableDescriptor} and
   * {@link HColumnDescriptor}.
   */
  private boolean fabricateTableInfo(FSTableDescriptors fstd, TableName tableName,
      Set<String> columns) throws IOException {
    if (columns == null || columns.isEmpty()) return false;
    HTableDescriptor htd = new HTableDescriptor(tableName);
    for (String columnFamily : columns) {
      htd.addFamily(new HColumnDescriptor(columnFamily));
    }
    fstd.createTableDescriptor(htd, true);
    return true;
  }

  /**
   * Fix the empty REGIONINFO_QUALIFIER rows in hbase:meta by deleting them.
   */
  public void fixEmptyMetaCells() throws IOException {
    if (shouldFixEmptyMetaCells() && !emptyRegionInfoQualifiers.isEmpty()) {
      LOG.info("Trying to fix empty REGIONINFO_QUALIFIER hbase:meta rows.");
      for (Result region : emptyRegionInfoQualifiers) {
        deleteMetaRegion(region.getRow());
        errors.getErrorList().remove(ERROR_CODE.EMPTY_META_CELL);
      }
      emptyRegionInfoQualifiers.clear();
    }
  }

  /**
   * Fix orphan tables by creating a .tableinfo file under the table dir:
   * <br>1. if a table descriptor is cached, recover the .tableinfo from it,
   * <br>2. else create a default .tableinfo with the correct table name, the
   * correct column family list, and default {@link HTableDescriptor} and
   * {@link HColumnDescriptor} properties.
   */
  public void fixOrphanTables() throws IOException {
    if (shouldFixTableOrphans() && !orphanTableDirs.isEmpty()) {

      List<TableName> tmpList = new ArrayList<TableName>();
      tmpList.addAll(orphanTableDirs.keySet());
      HTableDescriptor[] htds = getHTableDescriptors(tmpList);
      Iterator<Entry<TableName, Set<String>>> iter =
          orphanTableDirs.entrySet().iterator();
      int j = 0;
      int numFailedCase = 0;
      FSTableDescriptors fstd = new FSTableDescriptors(getConf());
      while (iter.hasNext()) {
        Entry<TableName, Set<String>> entry =
            iter.next();
        TableName tableName = entry.getKey();
        LOG.info("Trying to fix orphan table error: " + tableName);
        if (j < htds.length) {
          if (tableName.equals(htds[j].getTableName())) {
            HTableDescriptor htd = htds[j];
            LOG.info("fixing orphan table: " + tableName + " from cache");
            fstd.createTableDescriptor(htd, true);
            j++;
            iter.remove();
          }
        } else {
          if (fabricateTableInfo(fstd, tableName, entry.getValue())) {
            LOG.warn("fixing orphan table: " + tableName + " with a default .tableinfo file");
            LOG.warn("Strongly recommend modifying the HTableDescriptor if necessary for: "
                + tableName);
            iter.remove();
          } else {
            LOG.error("Unable to create default .tableinfo for " + tableName
                + " while missing column family information");
            numFailedCase++;
          }
        }
        fixes++;
      }

      if (orphanTableDirs.isEmpty()) {
        // all orphanTableDirs were luckily recovered;
        // re-run hbck after recovering the .tableinfo files
        setShouldRerun();
        LOG.warn("Strongly recommend re-running hbck manually after all orphanTableDirs"
            + " have been fixed");
      } else if (numFailedCase > 0) {
        LOG.error("Failed to fix " + numFailedCase
            + " OrphanTables with default .tableinfo files");
      }

    }
    // cleanup the list
    orphanTableDirs.clear();

  }

  /**
   * Create a new hbase:meta region on HDFS, with info-family caching disabled
   * while it is being built.
   *
   * @return an open hbase:meta HRegion
   */
  private HRegion createNewMeta() throws IOException {
    Path rootdir = FSUtils.getRootDir(getConf());
    Configuration c = getConf();
    HRegionInfo metaHRI = new HRegionInfo(HRegionInfo.FIRST_META_REGIONINFO);
    HTableDescriptor metaDescriptor = new FSTableDescriptors(c).get(TableName.META_TABLE_NAME);
    MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, false);
    HRegion meta = HRegion.createHRegion(metaHRI, rootdir, c, metaDescriptor);
    MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, true);
    return meta;
  }

  /**
   * Generate the set of puts to add to a new meta.  This expects the tables to
   * be clean with no overlaps or holes.  If there are any problems it returns null.
   *
   * @return an ArrayList of puts to apply in bulk, or null if the tables have problems
   */
  private ArrayList<Put> generatePuts(
      SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
    ArrayList<Put> puts = new ArrayList<Put>();
    boolean hasProblems = false;
    for (Entry<TableName, TableInfo> e : tablesInfo.entrySet()) {
      TableName name = e.getKey();

      // skip hbase:meta
      if (name.compareTo(TableName.META_TABLE_NAME) == 0) {
        continue;
      }

      TableInfo ti = e.getValue();
      for (Entry<byte[], Collection<HbckInfo>> spl : ti.sc.getStarts().asMap()
          .entrySet()) {
        Collection<HbckInfo> his = spl.getValue();
        int sz = his.size();
        if (sz != 1) {
          // problem
          LOG.error("Split starting at " + Bytes.toStringBinary(spl.getKey())
              + " had " + sz + " regions instead of exactly 1.");
          hasProblems = true;
          continue;
        }

        // add the row directly to meta.
        HbckInfo hi = his.iterator().next();
        HRegionInfo hri = hi.getHdfsHRI();
        Put p = MetaTableAccessor.makePutFromRegionInfo(hri);
        puts.add(p);
      }
    }
    return hasProblems ? null : puts;
  }

  /**
   * Suggest fixes for each table
   */
  private void suggestFixes(
      SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
    logParallelMerge();
    for (TableInfo tInfo : tablesInfo.values()) {
      TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
      tInfo.checkRegionChain(handler);
    }
  }

  /**
   * Rebuilds meta from information in hdfs/fs.  Depends on configuration settings
   * passed into the hbck constructor to point to a particular fs/dir.
   *
   * @param fix flag that determines if the method should attempt to fix holes
   * @return true if successful, false if the attempt failed.
   */
  public boolean rebuildMeta(boolean fix) throws IOException,
      InterruptedException {

    // Determine what's on HDFS
    LOG.info("Loading HBase regioninfo from HDFS...");
    loadHdfsRegionDirs();

    int errs = errors.getErrorList().size();
    tablesInfo = loadHdfsRegionInfos(); // update tableInfos based on region info in fs.
    checkHdfsIntegrity(false, false);

    // make sure everything is ok before rebuilding.
    if (errors.getErrorList().size() != errs) {
      // While in an error state, iterate until no more fixes are possible
      while (true) {
        fixes = 0;
        suggestFixes(tablesInfo);
        errors.clear();
        loadHdfsRegionInfos(); // update tableInfos based on region info in fs.
        checkHdfsIntegrity(shouldFixHdfsHoles(), shouldFixHdfsOverlaps());

        int errCount = errors.getErrorList().size();

        if (fixes == 0) {
          if (errCount > 0) {
            return false; // failed to fix problems.
          } else {
            break; // no fixes and no problems? drop out and rebuild.
          }
        }
      }
    }

    // we can rebuild, move old meta out of the way and start
    LOG.info("HDFS regioninfos seem good.  Sidelining old hbase:meta");
    Path backupDir = sidelineOldMeta();

    LOG.info("Creating new hbase:meta");
    HRegion meta = createNewMeta();

    // populate meta
    List<Put> puts = generatePuts(tablesInfo);
    if (puts == null) {
      LOG.fatal("Problem encountered when creating new hbase:meta entries.  " +
          "You may need to restore the previously sidelined hbase:meta");
      return false;
    }
    meta.batchMutate(puts.toArray(new Put[puts.size()]));
    HRegion.closeHRegion(meta);
    LOG.info("Success! hbase:meta table rebuilt.");
    LOG.info("Old hbase:meta is moved into " + backupDir);
    return true;
  }

  /**
   * Log an appropriate message about whether or not overlapping merges are computed in parallel.
   */
  private void logParallelMerge() {
    if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
      LOG.info("Handling overlap merges in parallel. Set hbasefsck.overlap.merge.parallel to" +
          " false to run serially.");
    } else {
      LOG.info("Handling overlap merges serially. Set hbasefsck.overlap.merge.parallel to" +
          " true to run in parallel.");
    }
  }

  private SortedMap<TableName, TableInfo> checkHdfsIntegrity(boolean fixHoles,
      boolean fixOverlaps) throws IOException {
    LOG.info("Checking HBase region split map from HDFS data...");
    logParallelMerge();
    for (TableInfo tInfo : tablesInfo.values()) {
      TableIntegrityErrorHandler handler;
      if (fixHoles || fixOverlaps) {
        handler = tInfo.new HDFSIntegrityFixer(tInfo, errors, getConf(),
            fixHoles, fixOverlaps);
      } else {
        handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
      }
      if (!tInfo.checkRegionChain(handler)) {
        // should dump info as well.
        errors.report("Found inconsistency in table " + tInfo.getName());
      }
    }
    return tablesInfo;
  }

  private Path getSidelineDir() throws IOException {
    if (sidelineDir == null) {
      Path hbaseDir = FSUtils.getRootDir(getConf());
      Path hbckDir = new Path(hbaseDir, HConstants.HBCK_SIDELINEDIR_NAME);
      sidelineDir = new Path(hbckDir, hbaseDir.getName() + "-"
          + startMillis);
    }
    return sidelineDir;
  }

  /**
   * Sideline a region dir (instead of deleting it)
   */
  Path sidelineRegionDir(FileSystem fs, HbckInfo hi) throws IOException {
    return sidelineRegionDir(fs, null, hi);
  }

  /**
   * Sideline a region dir (instead of deleting it)
   *
   * @param parentDir if specified, the region is sidelined to a folder like
   *     {@literal .../parentDir/<table name>/<region name>}. The purpose is to group
   *     together similar sidelined regions, for example regions that should be
   *     bulk loaded back later on. If null, it is ignored.
   */
  Path sidelineRegionDir(FileSystem fs,
      String parentDir, HbckInfo hi) throws IOException {
    TableName tableName = hi.getTableName();
    Path regionDir = hi.getHdfsRegionDir();

    if (!fs.exists(regionDir)) {
      LOG.warn("No previous " + regionDir + " exists.  Continuing.");
      return null;
    }

    Path rootDir = getSidelineDir();
    if (parentDir != null) {
      rootDir = new Path(rootDir, parentDir);
    }
    Path sidelineTableDir = FSUtils.getTableDir(rootDir, tableName);
    Path sidelineRegionDir = new Path(sidelineTableDir, regionDir.getName());
    fs.mkdirs(sidelineRegionDir);
    boolean success = false;
    FileStatus[] cfs = fs.listStatus(regionDir);
    if (cfs == null) {
      LOG.info("Region dir is empty: " + regionDir);
    } else {
      for (FileStatus cf : cfs) {
        Path src = cf.getPath();
        Path dst = new Path(sidelineRegionDir, src.getName());
        if (fs.isFile(src)) {
          // simple file
          success = fs.rename(src, dst);
          if (!success) {
            String msg = "Unable to rename file " + src + " to " + dst;
            LOG.error(msg);
            throw new IOException(msg);
          }
          continue;
        }

        // is a directory.
        fs.mkdirs(dst);

        LOG.info("Sidelining files from " + src + " into containing region " + dst);
        // FileSystem.rename is inconsistent with directories -- if the
        // dst (foo/a) exists and is a dir, and the src (foo/b) is a dir,
        // it moves the src into the dst dir resulting in (foo/a/b).  If
        // the dst does not exist, and the src is a dir, src becomes dst (foo/b).
        FileStatus[] hfiles = fs.listStatus(src);
        if (hfiles != null && hfiles.length > 0) {
          for (FileStatus hfile : hfiles) {
            success = fs.rename(hfile.getPath(), dst);
            if (!success) {
              String msg = "Unable to rename file " + src + " to " + dst;
              LOG.error(msg);
              throw new IOException(msg);
            }
          }
        }
        LOG.debug("Sideline directory contents:");
        debugLsr(sidelineRegionDir);
      }
    }

    LOG.info("Removing old region dir: " + regionDir);
    success = fs.delete(regionDir, true);
    if (!success) {
      String msg = "Unable to delete dir " + regionDir;
      LOG.error(msg);
      throw new IOException(msg);
    }
    return sidelineRegionDir;
  }

  /**
   * Side line an entire table.
   */
  void sidelineTable(FileSystem fs, TableName tableName, Path hbaseDir,
      Path backupHbaseDir) throws IOException {
    Path tableDir = FSUtils.getTableDir(hbaseDir, tableName);
    if (fs.exists(tableDir)) {
      Path backupTableDir = FSUtils.getTableDir(backupHbaseDir, tableName);
      fs.mkdirs(backupTableDir.getParent());
      boolean success = fs.rename(tableDir, backupTableDir);
      if (!success) {
        throw new IOException("Failed to move " + tableName + " from "
            + tableDir + " to " + backupTableDir);
      }
    } else {
      LOG.info("No previous " + tableName + " exists.  Continuing.");
    }
  }

  /**
   * @return Path to backup of original directory
   */
  Path sidelineOldMeta() throws IOException {
    // put current hbase:meta aside.
    Path hbaseDir = FSUtils.getRootDir(getConf());
    FileSystem fs = hbaseDir.getFileSystem(getConf());
    Path backupDir = getSidelineDir();
    fs.mkdirs(backupDir);

    try {
      sidelineTable(fs, TableName.META_TABLE_NAME, hbaseDir, backupDir);
    } catch (IOException e) {
      LOG.fatal("... failed to sideline meta. Currently in inconsistent state.  To restore, "
          + "try to rename hbase:meta in " + backupDir.getName() + " to "
          + hbaseDir.getName() + ".", e);
      throw e; // throw original exception
    }
    return backupDir;
  }

  /**
   * Load the list of disabled tables in ZK into the local disabledTables set.
   */
  private void loadDisabledTables()
      throws ZooKeeperConnectionException, IOException {
    HConnectionManager.execute(new HConnectable<Void>(getConf()) {
      @Override
      public Void connect(HConnection connection) throws IOException {
        ZooKeeperWatcher zkw = createZooKeeperWatcher();
        try {
          for (TableName tableName :
              ZKTableStateClientSideReader.getDisabledOrDisablingTables(zkw)) {
            disabledTables.add(tableName);
          }
        } catch (KeeperException ke) {
          throw new IOException(ke);
        } catch (InterruptedException e) {
          throw new InterruptedIOException();
        } finally {
          zkw.close();
        }
        return null;
      }
    });
  }

  /**
   * Check if the specified region's table is disabled.
   */
  private boolean isTableDisabled(HRegionInfo regionInfo) {
    return disabledTables.contains(regionInfo.getTable());
  }

  /**
   * Scan HDFS for all regions, recording their information into regionInfoMap.
   */
  public void loadHdfsRegionDirs() throws IOException, InterruptedException {
    Path rootDir = FSUtils.getRootDir(getConf());
    FileSystem fs = rootDir.getFileSystem(getConf());

    // list all tables from HDFS
    List<FileStatus> tableDirs = Lists.newArrayList();

    boolean foundVersionFile = fs.exists(new Path(rootDir, HConstants.VERSION_FILE_NAME));

    List<Path> paths = FSUtils.getTableDirs(fs, rootDir);
    for (Path path : paths) {
      TableName tableName = FSUtils.getTableName(path);
      if ((!checkMetaOnly &&
          isTableIncluded(tableName)) ||
          tableName.equals(TableName.META_TABLE_NAME)) {
        tableDirs.add(fs.getFileStatus(path));
      }
    }

    // verify that the version file exists
    if (!foundVersionFile) {
      errors.reportError(ERROR_CODE.NO_VERSION_FILE,
          "Version file does not exist in root dir " + rootDir);
      if (shouldFixVersionFile()) {
        LOG.info("Trying to create a new " + HConstants.VERSION_FILE_NAME
            + " file.");
        setShouldRerun();
        FSUtils.setVersion(fs, rootDir, getConf().getInt(
            HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000), getConf().getInt(
            HConstants.VERSION_FILE_WRITE_ATTEMPTS,
            HConstants.DEFAULT_VERSION_FILE_WRITE_ATTEMPTS));
      }
    }

    // scan each table dir in parallel
    List<WorkItemHdfsDir> dirs = new ArrayList<WorkItemHdfsDir>(tableDirs.size());
    List<Future<Void>> dirsFutures;

    for (FileStatus tableDir : tableDirs) {
      LOG.debug("Loading region dirs from " + tableDir.getPath());
      dirs.add(new WorkItemHdfsDir(this, fs, errors, tableDir));
    }

    // Invoke and wait for Callables to complete
    dirsFutures = executor.invokeAll(dirs);

    for (Future<Void> f: dirsFutures) {
      try {
        f.get();
      } catch (ExecutionException e) {
        LOG.warn("Could not load region dir ", e.getCause());
      }
    }
    errors.print("");
  }

  /**
   * Record the location of the hbase:meta region(s) in regionInfoMap.
   */
  private boolean recordMetaRegion() throws IOException {
    RegionLocations rl = ((ClusterConnection) connection).locateRegion(TableName.META_TABLE_NAME,
        HConstants.EMPTY_START_ROW, false, false);
    if (rl == null) {
      errors.reportError(ERROR_CODE.NULL_META_REGION,
          "META region or some of its attributes are null.");
      return false;
    }
    for (HRegionLocation metaLocation : rl.getRegionLocations()) {
      // Check if the META region is valid and existing
      if (metaLocation == null || metaLocation.getRegionInfo() == null ||
          metaLocation.getHostname() == null) {
        errors.reportError(ERROR_CODE.NULL_META_REGION,
            "META region or some of its attributes are null.");
        return false;
      }
      ServerName sn = metaLocation.getServerName();
      MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis());
      HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName());
      if (hbckInfo == null) {
        regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m));
      } else {
        hbckInfo.metaEntry = m;
      }
    }
    return true;
  }

  private ZooKeeperWatcher createZooKeeperWatcher() throws IOException {
    return new ZooKeeperWatcher(getConf(), "hbase Fsck", new Abortable() {
      @Override
      public void abort(String why, Throwable e) {
        LOG.error(why, e);
        System.exit(1);
      }

      @Override
      public boolean isAborted() {
        return false;
      }

    });
  }

  private ServerName getMetaRegionServerName(int replicaId)
      throws IOException, KeeperException {
    ZooKeeperWatcher zkw = createZooKeeperWatcher();
    ServerName sn = null;
    try {
      sn = new MetaTableLocator().getMetaRegionLocation(zkw, replicaId);
    } finally {
      zkw.close();
    }
    return sn;
  }

  /**
   * Contacts each regionserver and fetches metadata about regions.
   * @param regionServerList - the list of region servers to connect to
   * @throws IOException if a remote or network exception occurs
   */
  void processRegionServers(Collection<ServerName> regionServerList)
      throws IOException, InterruptedException {

    List<WorkItemRegion> workItems = new ArrayList<WorkItemRegion>(regionServerList.size());
    List<Future<Void>> workFutures;

    // loop to contact each region server in parallel
    for (ServerName rsinfo: regionServerList) {
      workItems.add(new WorkItemRegion(this, rsinfo, errors, connection));
    }

    workFutures = executor.invokeAll(workItems);

    for (int i = 0; i < workFutures.size(); i++) {
      WorkItemRegion item = workItems.get(i);
      Future<Void> f = workFutures.get(i);
      try {
        f.get();
      } catch (ExecutionException e) {
        LOG.warn("Could not process regionserver " + item.rsinfo.getHostAndPort(),
            e.getCause());
      }
    }
  }

  /**
   * Check consistency of all regions that have been found in previous phases.
   */
  private void checkAndFixConsistency()
      throws IOException, KeeperException, InterruptedException {
    // Divide the checks in two phases: one for default/primary replicas and another
    // for the non-primary ones. Keeps code cleaner this way.
    List<CheckRegionConsistencyWorkItem> workItems =
        new ArrayList<CheckRegionConsistencyWorkItem>(regionInfoMap.size());
    for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
      if (e.getValue().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
        workItems.add(new CheckRegionConsistencyWorkItem(e.getKey(), e.getValue()));
      }
    }
    checkRegionConsistencyConcurrently(workItems);

    boolean prevHdfsCheck = shouldCheckHdfs();
    setCheckHdfs(false); // replicas don't have any hdfs data
    // Run a pass over the replicas and fix any assignment issues that exist on the
    // currently deployed/undeployed replicas.
    List<CheckRegionConsistencyWorkItem> replicaWorkItems =
        new ArrayList<CheckRegionConsistencyWorkItem>(regionInfoMap.size());
    for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
      if (e.getValue().getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
        replicaWorkItems.add(new CheckRegionConsistencyWorkItem(e.getKey(), e.getValue()));
      }
    }
    checkRegionConsistencyConcurrently(replicaWorkItems);
    setCheckHdfs(prevHdfsCheck);
  }

  /**
   * Check the consistency of all regions using multiple threads concurrently.
   */
  private void checkRegionConsistencyConcurrently(
      final List<CheckRegionConsistencyWorkItem> workItems)
      throws IOException, KeeperException, InterruptedException {
    if (workItems.isEmpty()) {
      return;  // nothing to check
    }

    List<Future<Void>> workFutures = executor.invokeAll(workItems);
    for (Future<Void> f: workFutures) {
      try {
        f.get();
      } catch (ExecutionException e1) {
        LOG.warn("Could not check region consistency ", e1.getCause());
        if (e1.getCause() instanceof IOException) {
          throw (IOException) e1.getCause();
        } else if (e1.getCause() instanceof KeeperException) {
          throw (KeeperException) e1.getCause();
        } else if (e1.getCause() instanceof InterruptedException) {
          throw (InterruptedException) e1.getCause();
        } else {
          throw new IOException(e1.getCause());
        }
      }
    }
  }

  class CheckRegionConsistencyWorkItem implements Callable<Void> {
    private final String key;
    private final HbckInfo hbi;

    CheckRegionConsistencyWorkItem(String key, HbckInfo hbi) {
      this.key = key;
      this.hbi = hbi;
    }

    @Override
    public synchronized Void call() throws Exception {
      checkRegionConsistency(key, hbi);
      return null;
    }
  }

  private void preCheckPermission() throws IOException, AccessDeniedException {
    if (shouldIgnorePreCheckPermission()) {
      return;
    }

    Path hbaseDir = FSUtils.getRootDir(getConf());
    FileSystem fs = hbaseDir.getFileSystem(getConf());
    UserProvider userProvider = UserProvider.instantiate(getConf());
    UserGroupInformation ugi = userProvider.getCurrent().getUGI();
    FileStatus[] files = fs.listStatus(hbaseDir);
    for (FileStatus file : files) {
      try {
        FSUtils.checkAccess(ugi, file, FsAction.WRITE);
      } catch (AccessDeniedException ace) {
        LOG.warn("Got AccessDeniedException when preCheckPermission ", ace);
        errors.reportError(ERROR_CODE.WRONG_USAGE, "Current user " + ugi.getUserName()
            + " does not have write perms to " + file.getPath()
            + ". Please rerun hbck as hdfs user " + file.getOwner());
        throw ace;
      }
    }
  }

  /**
   * Deletes region from meta table
   */
  private void deleteMetaRegion(HbckInfo hi) throws IOException {
    deleteMetaRegion(hi.metaEntry.getRegionName());
  }

  /**
   * Deletes region from meta table
   */
  private void deleteMetaRegion(byte[] metaKey) throws IOException {
    Delete d = new Delete(metaKey);
    meta.delete(d);
    LOG.info("Deleted " + Bytes.toString(metaKey) + " from META");
  }

  /**
   * Reset the split parent region info in meta table
   */
  private void resetSplitParent(HbckInfo hi) throws IOException {
    RowMutations mutations = new RowMutations(hi.metaEntry.getRegionName());
    Delete d = new Delete(hi.metaEntry.getRegionName());
    d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITA_QUALIFIER);
    d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITB_QUALIFIER);
    mutations.add(d);

    HRegionInfo hri = new HRegionInfo(hi.metaEntry);
    hri.setOffline(false);
    hri.setSplit(false);
    Put p = MetaTableAccessor.makePutFromRegionInfo(hri);
    mutations.add(p);

    meta.mutateRow(mutations);
    LOG.info("Reset split parent " + hi.metaEntry.getRegionNameAsString() + " in META");
  }

  /**
   * This is a backwards-compatibility wrapper for permanently offlining a region
   * that should not be alive.  If the region server does not support the
   * "offline" method, it will use the closest unassign method instead.  This
   * will basically work until one attempts to disable or delete the affected
   * table.  The problem has to do with in-memory-only master state, so
   * restarting the HMaster or failing over to another should fix this.
   */
1932 private void offline(byte[] regionName) throws IOException {
1933 String regionString = Bytes.toStringBinary(regionName);
1934 if (!rsSupportsOffline) {
1935 LOG.warn("Using unassign region " + regionString
1936 + " instead of using offline method, you should"
1937 + " restart HMaster after these repairs");
1938 admin.unassign(regionName, true);
1939 return;
1940 }
1941
1942
1943 try {
1944 LOG.info("Offlining region " + regionString);
1945 admin.offline(regionName);
1946 } catch (IOException ioe) {
1947 String notFoundMsg = "java.lang.NoSuchMethodException: " +
1948 "org.apache.hadoop.hbase.master.HMaster.offline([B)";
1949 if (ioe.getMessage().contains(notFoundMsg)) {
1950 LOG.warn("Using unassign region " + regionString
1951 + " instead of using offline method, you should"
1952 + " restart HMaster after these repairs");
1953 rsSupportsOffline = false;
1954 admin.unassign(regionName, true);
1955 return;
1956 }
1957 throw ioe;
1958 }
1959 }
1960
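  /**
   * Undeploys the given region from every server it is reported on. When the region is
   * a primary, its deployed read replicas are undeployed as well and marked so that
   * later consistency checks skip them.
   */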
1961 private void undeployRegions(HbckInfo hi) throws IOException, InterruptedException {
1962 undeployRegionsForHbi(hi);
1963
1964 if (hi.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
1965 return;
1966 }
1967 int numReplicas = admin.getTableDescriptor(hi.getTableName()).getRegionReplication();
1968 for (int i = 1; i < numReplicas; i++) {
1969 if (hi.getPrimaryHRIForDeployedReplica() == null) continue;
1970 HRegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(
1971 hi.getPrimaryHRIForDeployedReplica(), i);
1972 HbckInfo h = regionInfoMap.get(hri.getEncodedName());
1973 if (h != null) {
1974 undeployRegionsForHbi(h);
1975
1976
1977 h.setSkipChecks(true);
1978 }
1979 }
1980 }
1981
1982 private void undeployRegionsForHbi(HbckInfo hi) throws IOException, InterruptedException {
1983 for (OnlineEntry rse : hi.deployedEntries) {
1984 LOG.debug("Undeploy region " + rse.hri + " from " + rse.hsa);
1985 try {
1986 HBaseFsckRepair.closeRegionSilentlyAndWait(connection, rse.hsa, rse.hri);
1987 offline(rse.hri.getRegionName());
1988 } catch (IOException ioe) {
1989 LOG.warn("Got exception when attempting to offline region "
1990 + Bytes.toString(rse.hri.getRegionName()), ioe);
1991 }
1992 }
1993 }
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
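  /**
   * Attempts to undeploy a region using the assignment information stored in hbase:meta.
   * If the region is known neither to meta nor to HDFS, whatever servers report it
   * online are closed directly. Note that this does not update the master's in-memory
   * assignment state; a separate fix/assign step is still required.
   */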
2007 private void closeRegion(HbckInfo hi) throws IOException, InterruptedException {
2008 if (hi.metaEntry == null && hi.hdfsEntry == null) {
2009 undeployRegions(hi);
2010 return;
2011 }
2012
2013
2014 Get get = new Get(hi.getRegionName());
2015 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2016 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
2017 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER);
2018
2019 if (hi.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2020 int numReplicas = admin.getTableDescriptor(hi.getTableName()).getRegionReplication();
2021 for (int i = 0; i < numReplicas; i++) {
2022 get.addColumn(HConstants.CATALOG_FAMILY, MetaTableAccessor.getServerColumn(i));
2023 get.addColumn(HConstants.CATALOG_FAMILY, MetaTableAccessor.getStartCodeColumn(i));
2024 }
2025 }
2026 Result r = meta.get(get);
2027 RegionLocations rl = MetaTableAccessor.getRegionLocations(r);
2028 if (rl == null) {
2029 LOG.warn("Unable to close region " + hi.getRegionNameAsString() +
2030 " since meta does not have handle to reach it");
2031 return;
2032 }
2033 for (HRegionLocation h : rl.getRegionLocations()) {
2034 ServerName serverName = h.getServerName();
2035 if (serverName == null) {
2036 errors.reportError("Unable to close region "
2037 + hi.getRegionNameAsString() + " because meta does not "
2038 + "have handle to reach it.");
2039 continue;
2040 }
2041 HRegionInfo hri = h.getRegionInfo();
2042 if (hri == null) {
2043 LOG.warn("Unable to close region " + hi.getRegionNameAsString()
2044 + " because hbase:meta had invalid or missing "
2045 + HConstants.CATALOG_FAMILY_STR + ":"
2046 + Bytes.toString(HConstants.REGIONINFO_QUALIFIER)
2047 + " qualifier value.");
2048 continue;
2049 }
2050
2051 HBaseFsckRepair.closeRegionSilentlyAndWait(connection, serverName, hri);
2052 }
2053 }
2054
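  /**
   * If assignment fixing is enabled, undeploys the region, forces a new assignment and
   * waits for it to complete. Replicas of a primary region are repaired the same way
   * and flagged so they are not checked again.
   */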
2055 private void tryAssignmentRepair(HbckInfo hbi, String msg) throws IOException,
2056 KeeperException, InterruptedException {
2057
2058 if (shouldFixAssignments()) {
2059 errors.print(msg);
2060 undeployRegions(hbi);
2061 setShouldRerun();
2062 HRegionInfo hri = hbi.getHdfsHRI();
2063 if (hri == null) {
2064 hri = hbi.metaEntry;
2065 }
2066 HBaseFsckRepair.fixUnassigned(admin, hri);
2067 HBaseFsckRepair.waitUntilAssigned(admin, hri);
2068
2069
2070 if (hbi.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) return;
2071 int replicationCount = admin.getTableDescriptor(hri.getTable()).getRegionReplication();
2072 for (int i = 1; i < replicationCount; i++) {
2073 hri = RegionReplicaUtil.getRegionInfoForReplica(hri, i);
2074 HbckInfo h = regionInfoMap.get(hri.getEncodedName());
2075 if (h != null) {
2076 undeployRegions(h);
2077
2078
2079 h.setSkipChecks(true);
2080 }
2081 HBaseFsckRepair.fixUnassigned(admin, hri);
2082 HBaseFsckRepair.waitUntilAssigned(admin, hri);
2083 }
2084
2085 }
2086 }
2087
2088
2089
2090
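  /**
   * Checks a single region for consistency between hbase:meta, HDFS and the servers it
   * is deployed on, and applies the enabled repairs (meta edits, reassignments,
   * undeploys) for the inconsistency that is found.
   */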
2091 private void checkRegionConsistency(final String key, final HbckInfo hbi)
2092 throws IOException, KeeperException, InterruptedException {
2093
2094 if (hbi.isSkipChecks()) return;
2095 String descriptiveName = hbi.toString();
2096 boolean inMeta = hbi.metaEntry != null;
2097
2098 boolean inHdfs = !shouldCheckHdfs() || hbi.getHdfsRegionDir() != null;
2099 boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
2100 boolean isDeployed = !hbi.deployedOn.isEmpty();
2101 boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
2102 boolean deploymentMatchesMeta =
2103 hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
2104 hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
2105 boolean splitParent =
2106 (hbi.metaEntry == null)? false: hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
2107 boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry);
2108 boolean recentlyModified = inHdfs &&
2109 hbi.getModTime() + timelag > System.currentTimeMillis();
2110
2111
2112 if (hbi.containsOnlyHdfsEdits()) {
2113 return;
2114 }
2115 if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
2116 return;
2117 } else if (inMeta && inHdfs && !shouldBeDeployed && !isDeployed) {
2118 LOG.info("Region " + descriptiveName + " is in META, and in a disabled " +
2119 "tabled that is not deployed");
2120 return;
2121 } else if (recentlyModified) {
2122 LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
2123 return;
2124 }
2125
2126 else if (!inMeta && !inHdfs && !isDeployed) {
2127
2128 assert false : "Entry for region with no data";
2129 } else if (!inMeta && !inHdfs && isDeployed) {
2130 errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
2131 + descriptiveName + ", key=" + key + ", not on HDFS or in hbase:meta but " +
2132 "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2133 if (shouldFixAssignments()) {
2134 undeployRegions(hbi);
2135 }
2136
2137 } else if (!inMeta && inHdfs && !isDeployed) {
2138 if (hbi.isMerged()) {
2139
2140
2141 hbi.setSkipChecks(true);
2142 LOG.info("Region " + descriptiveName
2143 + " got merge recently, its file(s) will be cleaned by CatalogJanitor later");
2144 return;
2145 }
2146 errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
2147 + descriptiveName + " on HDFS, but not listed in hbase:meta " +
2148 "or deployed on any region server");
2149
2150 if (shouldFixMeta()) {
2151 if (!hbi.isHdfsRegioninfoPresent()) {
2152 LOG.error("Region " + hbi.getHdfsHRI() + " could have been repaired"
2153 + " in table integrity repair phase if -fixHdfsOrphans was" +
2154 " used.");
2155 return;
2156 }
2157
2158 HRegionInfo hri = hbi.getHdfsHRI();
2159 TableInfo tableInfo = tablesInfo.get(hri.getTable());
2160
2161 for (HRegionInfo region : tableInfo.getRegionsFromMeta()) {
2162 if (Bytes.compareTo(region.getStartKey(), hri.getStartKey()) <= 0
2163 && (region.getEndKey().length == 0 || Bytes.compareTo(region.getEndKey(),
2164 hri.getEndKey()) >= 0)
2165 && Bytes.compareTo(region.getStartKey(), hri.getEndKey()) <= 0) {
2166 if(region.isSplit() || region.isOffline()) continue;
2167 Path regionDir = hbi.getHdfsRegionDir();
2168 FileSystem fs = regionDir.getFileSystem(getConf());
2169 List<Path> familyDirs = FSUtils.getFamilyDirs(fs, regionDir);
2170 for (Path familyDir : familyDirs) {
2171 List<Path> referenceFilePaths = FSUtils.getReferenceFilePaths(fs, familyDir);
2172 for (Path referenceFilePath : referenceFilePaths) {
2173 Path parentRegionDir =
2174 StoreFileInfo.getReferredToFile(referenceFilePath).getParent().getParent();
2175 if (parentRegionDir.toString().endsWith(region.getEncodedName())) {
2176 LOG.warn(hri + " start and stop keys are in the range of " + region
2177 + ". The region might not be cleaned up from hdfs when region " + region
2178 + " split failed. Hence deleting from hdfs.");
2179 HRegionFileSystem.deleteRegionFromFileSystem(getConf(), fs,
2180 regionDir.getParent(), hri);
2181 return;
2182 }
2183 }
2184 }
2185 }
2186 }
2187
2188 LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
2189 int numReplicas = admin.getTableDescriptor(hbi.getTableName()).getRegionReplication();
2190 HBaseFsckRepair.fixMetaHoleOnlineAndAddReplicas(getConf(), hbi.getHdfsHRI(),
2191 admin.getClusterStatus().getServers(), numReplicas);
2192
2193 tryAssignmentRepair(hbi, "Trying to reassign region...");
2194 }
2195
2196 } else if (!inMeta && inHdfs && isDeployed) {
2197 errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
2198 + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2199 debugLsr(hbi.getHdfsRegionDir());
2200 if (hbi.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
2201
2202
2203
2204
2205 if (shouldFixAssignments()) {
2206 undeployRegionsForHbi(hbi);
2207 }
2208 }
2209 if (shouldFixMeta() && hbi.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2210 if (!hbi.isHdfsRegioninfoPresent()) {
2211 LOG.error("This should have been repaired in table integrity repair phase");
2212 return;
2213 }
2214
2215 LOG.info("Patching hbase:meta with with .regioninfo: " + hbi.getHdfsHRI());
2216 int numReplicas = admin.getTableDescriptor(hbi.getTableName()).getRegionReplication();
2217 HBaseFsckRepair.fixMetaHoleOnlineAndAddReplicas(getConf(), hbi.getHdfsHRI(),
2218 admin.getClusterStatus().getServers(), numReplicas);
2219 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
2220 }
2221
2222
2223 } else if (inMeta && inHdfs && !isDeployed && splitParent) {
2224
2225
2226 if (hbi.metaEntry.splitA != null && hbi.metaEntry.splitB != null) {
2227
2228 HbckInfo infoA = this.regionInfoMap.get(hbi.metaEntry.splitA.getEncodedName());
2229 HbckInfo infoB = this.regionInfoMap.get(hbi.metaEntry.splitB.getEncodedName());
2230 if (infoA != null && infoB != null) {
2231
2232 hbi.setSkipChecks(true);
2233 return;
2234 }
2235 }
2236 errors.reportError(ERROR_CODE.LINGERING_SPLIT_PARENT, "Region "
2237 + descriptiveName + " is a split parent in META, in HDFS, "
2238 + "and not deployed on any region server. This could be transient.");
2239 if (shouldFixSplitParents()) {
2240 setShouldRerun();
2241 resetSplitParent(hbi);
2242 }
2243 } else if (inMeta && !inHdfs && !isDeployed) {
2244 errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
2245 + descriptiveName + " found in META, but not in HDFS "
2246 + "or deployed on any region server.");
2247 if (shouldFixMeta()) {
2248 deleteMetaRegion(hbi);
2249 }
2250 } else if (inMeta && !inHdfs && isDeployed) {
2251 errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
2252 + " found in META, but not in HDFS, " +
2253 "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2254
2255
2256
2257 if (shouldFixAssignments()) {
2258 errors.print("Trying to fix unassigned region...");
2259 undeployRegions(hbi);
2260 }
2261 if (shouldFixMeta()) {
2262
2263 deleteMetaRegion(hbi);
2264 }
2265 } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
2266 errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
2267 + " not deployed on any region server.");
2268 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
2269 } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
2270 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
2271 "Region " + descriptiveName + " should not be deployed according " +
2272 "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2273 if (shouldFixAssignments()) {
2274 errors.print("Trying to close the region " + descriptiveName);
2275 setShouldRerun();
2276 HBaseFsckRepair.fixMultiAssignment(connection, hbi.metaEntry, hbi.deployedOn);
2277 }
2278 } else if (inMeta && inHdfs && isMultiplyDeployed) {
2279 errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
2280 + " is listed in hbase:meta on region server " + hbi.metaEntry.regionServer
2281 + " but is multiply assigned to region servers " +
2282 Joiner.on(", ").join(hbi.deployedOn));
2283
2284 if (shouldFixAssignments()) {
2285 errors.print("Trying to fix assignment error...");
2286 setShouldRerun();
2287 HBaseFsckRepair.fixMultiAssignment(connection, hbi.metaEntry, hbi.deployedOn);
2288 }
2289 } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
2290 errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
2291 + descriptiveName + " listed in hbase:meta on region server " +
2292 hbi.metaEntry.regionServer + " but found on region server " +
2293 hbi.deployedOn.get(0));
2294
2295 if (shouldFixAssignments()) {
2296 errors.print("Trying to fix assignment error...");
2297 setShouldRerun();
2298 HBaseFsckRepair.fixMultiAssignment(connection, hbi.metaEntry, hbi.deployedOn);
2299 HBaseFsckRepair.waitUntilAssigned(admin, hbi.getHdfsHRI());
2300 }
2301 } else {
2302 errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
2303 " is in an unforeseen state:" +
2304 " inMeta=" + inMeta +
2305 " inHdfs=" + inHdfs +
2306 " isDeployed=" + isDeployed +
2307 " isMultiplyDeployed=" + isMultiplyDeployed +
2308 " deploymentMatchesMeta=" + deploymentMatchesMeta +
2309 " shouldBeDeployed=" + shouldBeDeployed);
2310 }
2311 }
2312
2313
2314
2315
2316
2317
2318
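  /**
   * Checks table integrity: groups the collected regions by table and verifies every
   * table's region chain for holes, overlaps and other inconsistencies.
   *
   * @return the map of table names to the TableInfo gathered for each table
   */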
2319 SortedMap<TableName, TableInfo> checkIntegrity() throws IOException {
2320 tablesInfo = new TreeMap<TableName,TableInfo> ();
2321 LOG.debug("There are " + regionInfoMap.size() + " region info entries");
2322 for (HbckInfo hbi : regionInfoMap.values()) {
2323
2324 if (hbi.metaEntry == null) {
2325
2326 Path p = hbi.getHdfsRegionDir();
2327 if (p == null) {
2328 errors.report("No regioninfo in Meta or HDFS. " + hbi);
2329 }
2330
2331
2332 continue;
2333 }
2334 if (hbi.metaEntry.regionServer == null) {
2335 errors.detail("Skipping region because no region server: " + hbi);
2336 continue;
2337 }
2338 if (hbi.metaEntry.isOffline()) {
2339 errors.detail("Skipping region because it is offline: " + hbi);
2340 continue;
2341 }
2342 if (hbi.containsOnlyHdfsEdits()) {
2343 errors.detail("Skipping region because it only contains edits" + hbi);
2344 continue;
2345 }
2346
2347
2348
2349
2350
2351
2352 if (hbi.deployedOn.size() == 0) continue;
2353
2354
2355 TableName tableName = hbi.metaEntry.getTable();
2356 TableInfo modTInfo = tablesInfo.get(tableName);
2357 if (modTInfo == null) {
2358 modTInfo = new TableInfo(tableName);
2359 }
2360 for (ServerName server : hbi.deployedOn) {
2361 modTInfo.addServer(server);
2362 }
2363
2364 if (!hbi.isSkipChecks()) {
2365 modTInfo.addRegionInfo(hbi);
2366 }
2367
2368 tablesInfo.put(tableName, modTInfo);
2369 }
2370
2371 loadTableInfosForTablesWithNoRegion();
2372
2373 logParallelMerge();
2374 for (TableInfo tInfo : tablesInfo.values()) {
2375 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
2376 if (!tInfo.checkRegionChain(handler)) {
2377 errors.report("Found inconsistency in table " + tInfo.getName());
2378 }
2379 }
2380 return tablesInfo;
2381 }
2382
2383
2384
2385
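  /**
   * Loads table descriptors for tables that have no regions reported in hbase:meta but
   * do have a table descriptor on HDFS, so they are still covered by the integrity
   * checks.
   */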
2386 private void loadTableInfosForTablesWithNoRegion() throws IOException {
2387 Map<String, HTableDescriptor> allTables = new FSTableDescriptors(getConf()).getAll();
2388 for (HTableDescriptor htd : allTables.values()) {
2389 if (checkMetaOnly && !htd.isMetaTable()) {
2390 continue;
2391 }
2392
2393 TableName tableName = htd.getTableName();
2394 if (isTableIncluded(tableName) && !tablesInfo.containsKey(tableName)) {
2395 TableInfo tableInfo = new TableInfo(tableName);
2396 tableInfo.htds.add(htd);
2397 tablesInfo.put(htd.getTableName(), tableInfo);
2398 }
2399 }
2400 }
2401
2402
2403
2404
2405
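  /**
   * Merges HDFS data by moving the store files of the contained region into the target
   * region directory (skipping the .regioninfo file and old WAL dirs) and then
   * sidelining the contained region directory.
   *
   * @return the number of files moved while merging
   */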
2406 public int mergeRegionDirs(Path targetRegionDir, HbckInfo contained) throws IOException {
2407 int fileMoves = 0;
2408 String thread = Thread.currentThread().getName();
2409 LOG.debug("[" + thread + "] Contained region dir after close and pause");
2410 debugLsr(contained.getHdfsRegionDir());
2411
2412
2413 FileSystem fs = targetRegionDir.getFileSystem(getConf());
2414 FileStatus[] dirs = null;
2415 try {
2416 dirs = fs.listStatus(contained.getHdfsRegionDir());
2417 } catch (FileNotFoundException fnfe) {
2418
2419
2420 if (!fs.exists(contained.getHdfsRegionDir())) {
2421 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2422 + " is missing. Assuming already sidelined or moved.");
2423 } else {
2424 sidelineRegionDir(fs, contained);
2425 }
2426 return fileMoves;
2427 }
2428
2429 if (dirs == null) {
2430 if (!fs.exists(contained.getHdfsRegionDir())) {
2431 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2432 + " already sidelined.");
2433 } else {
2434 sidelineRegionDir(fs, contained);
2435 }
2436 return fileMoves;
2437 }
2438
2439 for (FileStatus cf : dirs) {
2440 Path src = cf.getPath();
2441 Path dst = new Path(targetRegionDir, src.getName());
2442
2443 if (src.getName().equals(HRegionFileSystem.REGION_INFO_FILE)) {
2444
2445 continue;
2446 }
2447
2448 if (src.getName().equals(HConstants.HREGION_OLDLOGDIR_NAME)) {
2449
2450 continue;
2451 }
2452
2453 LOG.info("[" + thread + "] Moving files from " + src + " into containing region " + dst);
2454
2455
2456
2457
2458 for (FileStatus hfile : fs.listStatus(src)) {
2459 boolean success = fs.rename(hfile.getPath(), dst);
2460 if (success) {
2461 fileMoves++;
2462 }
2463 }
2464 LOG.debug("[" + thread + "] Sideline directory contents:");
2465 debugLsr(targetRegionDir);
2466 }
2467
2468
2469 sidelineRegionDir(fs, contained);
2470 LOG.info("[" + thread + "] Sidelined region dir "+ contained.getHdfsRegionDir() + " into " +
2471 getSidelineDir());
2472 debugLsr(contained.getHdfsRegionDir());
2473
2474 return fileMoves;
2475 }
2476
2477
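  /**
   * Callable that hands one group of overlapping regions to the table integrity handler.
   */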
2478 static class WorkItemOverlapMerge implements Callable<Void> {
2479 private TableIntegrityErrorHandler handler;
2480 Collection<HbckInfo> overlapgroup;
2481
2482 WorkItemOverlapMerge(Collection<HbckInfo> overlapgroup, TableIntegrityErrorHandler handler) {
2483 this.handler = handler;
2484 this.overlapgroup = overlapgroup;
2485 }
2486
2487 @Override
2488 public Void call() throws Exception {
2489 handler.handleOverlapGroup(overlapgroup);
2490 return null;
2491 }
2492 };
2493
2494
2495
2496
2497
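  /**
   * Maintains all the information hbck collects about a single table: its regions and
   * split points, the servers it is deployed on, backwards and sidelined regions, the
   * table descriptors found, and the detected overlap groups.
   */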
2498 public class TableInfo {
2499 TableName tableName;
2500 TreeSet <ServerName> deployedOn;
2501
2502
2503 final List<HbckInfo> backwards = new ArrayList<HbckInfo>();
2504
2505
2506 final Map<Path, HbckInfo> sidelinedRegions = new HashMap<Path, HbckInfo>();
2507
2508
2509 final RegionSplitCalculator<HbckInfo> sc = new RegionSplitCalculator<HbckInfo>(cmp);
2510
2511
2512 final Set<HTableDescriptor> htds = new HashSet<HTableDescriptor>();
2513
2514
2515 final Multimap<byte[], HbckInfo> overlapGroups =
2516 TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp);
2517
2518
2519 private ImmutableList<HRegionInfo> regionsFromMeta = null;
2520
2521 TableInfo(TableName name) {
2522 this.tableName = name;
2523 deployedOn = new TreeSet <ServerName>();
2524 }
2525
2526
2527
2528
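    /**
     * @return the table descriptor shared by all regions of this table, or null when
     *         none or several distinct descriptors were found
     */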
2529 private HTableDescriptor getHTD() {
2530 if (htds.size() == 1) {
2531 return (HTableDescriptor)htds.toArray()[0];
2532 } else {
2533 LOG.error("None/Multiple table descriptors found for table '"
2534 + tableName + "' regions: " + htds);
2535 }
2536 return null;
2537 }
2538
2539 public void addRegionInfo(HbckInfo hir) {
2540 if (Bytes.equals(hir.getEndKey(), HConstants.EMPTY_END_ROW)) {
2541
2542
2543 if (hir.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) sc.add(hir);
2544 return;
2545 }
2546
2547
2548 if (Bytes.compareTo(hir.getStartKey(), hir.getEndKey()) > 0) {
2549 errors.reportError(
2550 ERROR_CODE.REGION_CYCLE,
2551 String.format("The endkey for this region comes before the "
2552 + "startkey, startkey=%s, endkey=%s",
2553 Bytes.toStringBinary(hir.getStartKey()),
2554 Bytes.toStringBinary(hir.getEndKey())), this, hir);
2555 backwards.add(hir);
2556 return;
2557 }
2558
2559
2560
2561 if (hir.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) sc.add(hir);
2562 }
2563
2564 public void addServer(ServerName server) {
2565 this.deployedOn.add(server);
2566 }
2567
2568 public TableName getName() {
2569 return tableName;
2570 }
2571
2572 public int getNumRegions() {
2573 return sc.getStarts().size() + backwards.size();
2574 }
2575
2576 public synchronized ImmutableList<HRegionInfo> getRegionsFromMeta() {
2577
2578 if (regionsFromMeta == null) {
2579 List<HRegionInfo> regions = new ArrayList<HRegionInfo>();
2580 for (HbckInfo h : HBaseFsck.this.regionInfoMap.values()) {
2581 if (tableName.equals(h.getTableName())) {
2582 if (h.metaEntry != null) {
2583 regions.add((HRegionInfo) h.metaEntry);
2584 }
2585 }
2586 }
2587 regionsFromMeta = Ordering.natural().immutableSortedCopy(regions);
2588 }
2589
2590 return regionsFromMeta;
2591 }
2592
2593
2594 private class IntegrityFixSuggester extends TableIntegrityErrorHandlerImpl {
2595 ErrorReporter errors;
2596
2597 IntegrityFixSuggester(TableInfo ti, ErrorReporter errors) {
2598 this.errors = errors;
2599 setTableInfo(ti);
2600 }
2601
2602 @Override
2603 public void handleRegionStartKeyNotEmpty(HbckInfo hi) throws IOException{
2604 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2605 "First region should start with an empty key. You need to "
2606 + " create a new region and regioninfo in HDFS to plug the hole.",
2607 getTableInfo(), hi);
2608 }
2609
2610 @Override
2611 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2612 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2613 "Last region should end with an empty key. You need to "
2614 + "create a new region and regioninfo in HDFS to plug the hole.", getTableInfo());
2615 }
2616
2617 @Override
2618 public void handleDegenerateRegion(HbckInfo hi) throws IOException{
2619 errors.reportError(ERROR_CODE.DEGENERATE_REGION,
2620 "Region has the same start and end key.", getTableInfo(), hi);
2621 }
2622
2623 @Override
2624 public void handleDuplicateStartKeys(HbckInfo r1, HbckInfo r2) throws IOException{
2625 byte[] key = r1.getStartKey();
2626
2627 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2628 "Multiple regions have the same startkey: "
2629 + Bytes.toStringBinary(key), getTableInfo(), r1);
2630 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2631 "Multiple regions have the same startkey: "
2632 + Bytes.toStringBinary(key), getTableInfo(), r2);
2633 }
2634
2635 @Override
2636 public void handleOverlapInRegionChain(HbckInfo hi1, HbckInfo hi2) throws IOException{
2637 errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
2638 "There is an overlap in the region chain.",
2639 getTableInfo(), hi1, hi2);
2640 }
2641
2642 @Override
2643 public void handleHoleInRegionChain(byte[] holeStart, byte[] holeStop) throws IOException{
2644 errors.reportError(
2645 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2646 "There is a hole in the region chain between "
2647 + Bytes.toStringBinary(holeStart) + " and "
2648 + Bytes.toStringBinary(holeStop)
2649 + ". You need to create a new .regioninfo and region "
2650 + "dir in hdfs to plug the hole.");
2651 }
2652 };
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
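    /**
     * Integrity handler that repairs errors based on the HDFS view of the table. It
     * plugs holes (including missing first and last regions) by creating new empty
     * regions on HDFS, and repairs overlap groups by merging the regions or, for very
     * large groups, sidelining some of them first. Regions are closed and offlined
     * before their files are moved.
     */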
2666 private class HDFSIntegrityFixer extends IntegrityFixSuggester {
2667 Configuration conf;
2668
2669 boolean fixOverlaps = true;
2670
2671 HDFSIntegrityFixer(TableInfo ti, ErrorReporter errors, Configuration conf,
2672 boolean fixHoles, boolean fixOverlaps) {
2673 super(ti, errors);
2674 this.conf = conf;
2675 this.fixOverlaps = fixOverlaps;
2676
2677 }
2678
2679
2680
2681
2682
2683
2684 @Override
2685 public void handleRegionStartKeyNotEmpty(HbckInfo next) throws IOException {
2686 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2687 "First region should start with an empty key. Creating a new " +
2688 "region and regioninfo in HDFS to plug the hole.",
2689 getTableInfo(), next);
2690 HTableDescriptor htd = getTableInfo().getHTD();
2691
2692 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(),
2693 HConstants.EMPTY_START_ROW, next.getStartKey());
2694
2695
2696 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2697 LOG.info("Table region start key was not empty. Created new empty region: "
2698 + newRegion + " " +region);
2699 fixes++;
2700 }
2701
2702 @Override
2703 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2704 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2705 "Last region should end with an empty key. Creating a new "
2706 + "region and regioninfo in HDFS to plug the hole.", getTableInfo());
2707 HTableDescriptor htd = getTableInfo().getHTD();
2708
2709 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), curEndKey,
2710 HConstants.EMPTY_START_ROW);
2711
2712 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2713 LOG.info("Table region end key was not empty. Created new empty region: " + newRegion
2714 + " " + region);
2715 fixes++;
2716 }
2717
2718
2719
2720
2721
2722 @Override
2723 public void handleHoleInRegionChain(byte[] holeStartKey, byte[] holeStopKey) throws IOException {
2724 errors.reportError(
2725 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2726 "There is a hole in the region chain between "
2727 + Bytes.toStringBinary(holeStartKey) + " and "
2728 + Bytes.toStringBinary(holeStopKey)
2729 + ". Creating a new regioninfo and region "
2730 + "dir in hdfs to plug the hole.");
2731 HTableDescriptor htd = getTableInfo().getHTD();
2732 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), holeStartKey, holeStopKey);
2733 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2734 LOG.info("Plugged hole by creating new empty region: "+ newRegion + " " +region);
2735 fixes++;
2736 }
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
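      /**
       * Merges a group of overlapping regions into a single new region. Groups larger
       * than maxMerge are not merged; if sidelining is enabled they are handed to
       * sidelineBigOverlaps instead.
       */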
2749 @Override
2750 public void handleOverlapGroup(Collection<HbckInfo> overlap)
2751 throws IOException {
2752 Preconditions.checkNotNull(overlap);
2753 Preconditions.checkArgument(overlap.size() > 0);
2754
2755 if (!this.fixOverlaps) {
2756 LOG.warn("Not attempting to repair overlaps.");
2757 return;
2758 }
2759
2760 if (overlap.size() > maxMerge) {
2761 LOG.warn("Overlap group has " + overlap.size() + " overlapping " +
2762 "regions which is greater than " + maxMerge + ", the max number of regions to merge");
2763 if (sidelineBigOverlaps) {
2764
2765 sidelineBigOverlaps(overlap);
2766 }
2767 return;
2768 }
2769
2770 mergeOverlaps(overlap);
2771 }
2772
2773 void mergeOverlaps(Collection<HbckInfo> overlap)
2774 throws IOException {
2775 String thread = Thread.currentThread().getName();
2776 LOG.info("== [" + thread + "] Merging regions into one region: "
2777 + Joiner.on(",").join(overlap));
2778
2779 Pair<byte[], byte[]> range = null;
2780 for (HbckInfo hi : overlap) {
2781 if (range == null) {
2782 range = new Pair<byte[], byte[]>(hi.getStartKey(), hi.getEndKey());
2783 } else {
2784 if (RegionSplitCalculator.BYTES_COMPARATOR
2785 .compare(hi.getStartKey(), range.getFirst()) < 0) {
2786 range.setFirst(hi.getStartKey());
2787 }
2788 if (RegionSplitCalculator.BYTES_COMPARATOR
2789 .compare(hi.getEndKey(), range.getSecond()) > 0) {
2790 range.setSecond(hi.getEndKey());
2791 }
2792 }
2793
2794 LOG.debug("[" + thread + "] Closing region before moving data around: " + hi);
2795 LOG.debug("[" + thread + "] Contained region dir before close");
2796 debugLsr(hi.getHdfsRegionDir());
2797 try {
2798 LOG.info("[" + thread + "] Closing region: " + hi);
2799 closeRegion(hi);
2800 } catch (IOException ioe) {
2801 LOG.warn("[" + thread + "] Was unable to close region " + hi
2802 + ". Just continuing... ", ioe);
2803 } catch (InterruptedException e) {
2804 LOG.warn("[" + thread + "] Was unable to close region " + hi
2805 + ". Just continuing... ", e);
2806 }
2807
2808 try {
2809 LOG.info("[" + thread + "] Offlining region: " + hi);
2810 offline(hi.getRegionName());
2811 } catch (IOException ioe) {
2812 LOG.warn("[" + thread + "] Unable to offline region from master: " + hi
2813 + ". Just continuing... ", ioe);
2814 }
2815 }
2816
2817
2818 HTableDescriptor htd = getTableInfo().getHTD();
2819
2820 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), range.getFirst(),
2821 range.getSecond());
2822 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2823 LOG.info("[" + thread + "] Created new empty container region: " +
2824 newRegion + " to contain regions: " + Joiner.on(",").join(overlap));
2825 debugLsr(region.getRegionFileSystem().getRegionDir());
2826
2827
2828 boolean didFix = false;
2829 Path target = region.getRegionFileSystem().getRegionDir();
2830 for (HbckInfo contained : overlap) {
2831 LOG.info("[" + thread + "] Merging " + contained + " into " + target );
2832 int merges = mergeRegionDirs(target, contained);
2833 if (merges > 0) {
2834 didFix = true;
2835 }
2836 }
2837 if (didFix) {
2838 fixes++;
2839 }
2840 }
2841
2842
2843
2844
2845
2846
2847
2848
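      /**
       * Sidelines part of an oversized overlap group (at most maxOverlapsToSideline
       * regions, chosen as the biggest ranges) so that the remaining regions can be
       * merged. Sidelined region directories must be bulk loaded back afterwards.
       */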
2849 void sidelineBigOverlaps(
2850 Collection<HbckInfo> bigOverlap) throws IOException {
2851 int overlapsToSideline = bigOverlap.size() - maxMerge;
2852 if (overlapsToSideline > maxOverlapsToSideline) {
2853 overlapsToSideline = maxOverlapsToSideline;
2854 }
2855 List<HbckInfo> regionsToSideline =
2856 RegionSplitCalculator.findBigRanges(bigOverlap, overlapsToSideline);
2857 FileSystem fs = FileSystem.get(conf);
2858 for (HbckInfo regionToSideline: regionsToSideline) {
2859 try {
2860 LOG.info("Closing region: " + regionToSideline);
2861 closeRegion(regionToSideline);
2862 } catch (IOException ioe) {
2863 LOG.warn("Was unable to close region " + regionToSideline
2864 + ". Just continuing... ", ioe);
2865 } catch (InterruptedException e) {
2866 LOG.warn("Was unable to close region " + regionToSideline
2867 + ". Just continuing... ", e);
2868 }
2869
2870 try {
2871 LOG.info("Offlining region: " + regionToSideline);
2872 offline(regionToSideline.getRegionName());
2873 } catch (IOException ioe) {
2874 LOG.warn("Unable to offline region from master: " + regionToSideline
2875 + ". Just continuing... ", ioe);
2876 }
2877
2878 LOG.info("Before sideline big overlapped region: " + regionToSideline.toString());
2879 Path sidelineRegionDir = sidelineRegionDir(fs, TO_BE_LOADED, regionToSideline);
2880 if (sidelineRegionDir != null) {
2881 sidelinedRegions.put(sidelineRegionDir, regionToSideline);
2882 LOG.info("After sidelined big overlapped region: "
2883 + regionToSideline.getRegionNameAsString()
2884 + " to " + sidelineRegionDir.toString());
2885 fixes++;
2886 }
2887 }
2888 }
2889 }
2890
2891
2892
2893
2894
2895
2896
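    /**
     * Walks this table's region chain, as computed by the split calculator, looking for
     * holes, overlaps, duplicate start keys, degenerate regions and missing first/last
     * regions, and hands each finding to the supplied handler. Disabled tables are
     * skipped.
     *
     * @return true when no new errors were reported while checking the chain
     */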
2897 public boolean checkRegionChain(TableIntegrityErrorHandler handler) throws IOException {
2898
2899
2900
2901 if (disabledTables.contains(this.tableName)) {
2902 return true;
2903 }
2904 int originalErrorsCount = errors.getErrorList().size();
2905 Multimap<byte[], HbckInfo> regions = sc.calcCoverage();
2906 SortedSet<byte[]> splits = sc.getSplits();
2907
2908 byte[] prevKey = null;
2909 byte[] problemKey = null;
2910
2911 if (splits.size() == 0) {
2912
2913 handler.handleHoleInRegionChain(HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW);
2914 }
2915
2916 for (byte[] key : splits) {
2917 Collection<HbckInfo> ranges = regions.get(key);
2918 if (prevKey == null && !Bytes.equals(key, HConstants.EMPTY_BYTE_ARRAY)) {
2919 for (HbckInfo rng : ranges) {
2920 handler.handleRegionStartKeyNotEmpty(rng);
2921 }
2922 }
2923
2924
2925 for (HbckInfo rng : ranges) {
2926
2927 byte[] endKey = rng.getEndKey();
2928 endKey = (endKey.length == 0) ? null : endKey;
2929 if (Bytes.equals(rng.getStartKey(), endKey)) {
2930 handler.handleDegenerateRegion(rng);
2931 }
2932 }
2933
2934 if (ranges.size() == 1) {
2935
2936 if (problemKey != null) {
2937 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
2938 }
2939 problemKey = null;
2940 } else if (ranges.size() > 1) {
2941
2942
2943 if (problemKey == null) {
2944
2945 LOG.warn("Naming new problem group: " + Bytes.toStringBinary(key));
2946 problemKey = key;
2947 }
2948 overlapGroups.putAll(problemKey, ranges);
2949
2950
2951 ArrayList<HbckInfo> subRange = new ArrayList<HbckInfo>(ranges);
2952
2953 for (HbckInfo r1 : ranges) {
2954 if (r1.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) continue;
2955 subRange.remove(r1);
2956 for (HbckInfo r2 : subRange) {
2957 if (r2.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) continue;
2958 if (Bytes.compareTo(r1.getStartKey(), r2.getStartKey()) == 0) {
2959 handler.handleDuplicateStartKeys(r1, r2);
2960 } else {
2961
2962 handler.handleOverlapInRegionChain(r1, r2);
2963 }
2964 }
2965 }
2966
2967 } else if (ranges.size() == 0) {
2968 if (problemKey != null) {
2969 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
2970 }
2971 problemKey = null;
2972
2973 byte[] holeStopKey = sc.getSplits().higher(key);
2974
2975 if (holeStopKey != null) {
2976
2977 handler.handleHoleInRegionChain(key, holeStopKey);
2978 }
2979 }
2980 prevKey = key;
2981 }
2982
2983
2984
2985 if (prevKey != null) {
2986 handler.handleRegionEndKeyNotEmpty(prevKey);
2987 }
2988
2989
2990 if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
2991 boolean ok = handleOverlapsParallel(handler, prevKey);
2992 if (!ok) {
2993 return false;
2994 }
2995 } else {
2996 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
2997 handler.handleOverlapGroup(overlap);
2998 }
2999 }
3000
3001 if (details) {
3002
3003 errors.print("---- Table '" + this.tableName
3004 + "': region split map");
3005 dump(splits, regions);
3006 errors.print("---- Table '" + this.tableName
3007 + "': overlap groups");
3008 dumpOverlapProblems(overlapGroups);
3009 errors.print("There are " + overlapGroups.keySet().size()
3010 + " overlap groups with " + overlapGroups.size()
3011 + " overlapping regions");
3012 }
3013 if (!sidelinedRegions.isEmpty()) {
3014 LOG.warn("Sidelined big overlapped regions, please bulk load them!");
3015 errors.print("---- Table '" + this.tableName
3016 + "': sidelined big overlapped regions");
3017 dumpSidelinedRegions(sidelinedRegions);
3018 }
3019 return errors.getErrorList().size() == originalErrorsCount;
3020 }
3021
3022 private boolean handleOverlapsParallel(TableIntegrityErrorHandler handler, byte[] prevKey)
3023 throws IOException {
3024
3025
3026 List<WorkItemOverlapMerge> merges = new ArrayList<WorkItemOverlapMerge>(overlapGroups.size());
3027 List<Future<Void>> rets;
3028 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
3029
3030 merges.add(new WorkItemOverlapMerge(overlap, handler));
3031 }
3032 try {
3033 rets = executor.invokeAll(merges);
3034 } catch (InterruptedException e) {
3035 LOG.error("Overlap merges were interrupted", e);
3036 return false;
3037 }
3038 for(int i=0; i<merges.size(); i++) {
3039 WorkItemOverlapMerge work = merges.get(i);
3040 Future<Void> f = rets.get(i);
3041 try {
3042 f.get();
3043 } catch (ExecutionException e) {
3044 LOG.warn("Failed to merge overlap group " + work, e.getCause());
3045 } catch (InterruptedException e) {
3046 LOG.error("Waiting for overlap merges was interrupted", e);
3047 return false;
3048 }
3049 }
3050 return true;
3051 }
3052
3053
3054
3055
3056
3057
3058
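    /**
     * Dumps every split point together with the regions that cover it, in a form that
     * is reasonable for visual debugging.
     */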
3059 void dump(SortedSet<byte[]> splits, Multimap<byte[], HbckInfo> regions) {
3060
3061 StringBuilder sb = new StringBuilder();
3062 for (byte[] k : splits) {
3063 sb.setLength(0);
3064 sb.append(Bytes.toStringBinary(k) + ":\t");
3065 for (HbckInfo r : regions.get(k)) {
3066 sb.append("[ "+ r.toString() + ", "
3067 + Bytes.toStringBinary(r.getEndKey())+ "]\t");
3068 }
3069 errors.print(sb.toString());
3070 }
3071 }
3072 }
3073
3074 public void dumpOverlapProblems(Multimap<byte[], HbckInfo> regions) {
3075
3076
3077 for (byte[] k : regions.keySet()) {
3078 errors.print(Bytes.toStringBinary(k) + ":");
3079 for (HbckInfo r : regions.get(k)) {
3080 errors.print("[ " + r.toString() + ", "
3081 + Bytes.toStringBinary(r.getEndKey()) + "]");
3082 }
3083 errors.print("----");
3084 }
3085 }
3086
3087 public void dumpSidelinedRegions(Map<Path, HbckInfo> regions) {
3088 for (Map.Entry<Path, HbckInfo> entry: regions.entrySet()) {
3089 TableName tableName = entry.getValue().getTableName();
3090 Path path = entry.getKey();
3091 errors.print("This sidelined region dir should be bulk loaded: "
3092 + path.toString());
3093 errors.print("Bulk load command looks like: "
3094 + "hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles "
3095 + path.toUri().getPath() + " "+ tableName);
3096 }
3097 }
3098
3099 public Multimap<byte[], HbckInfo> getOverlapGroups(
3100 TableName table) {
3101 TableInfo ti = tablesInfo.get(table);
3102 return ti.overlapGroups;
3103 }
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
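  /**
   * Returns the descriptors of user tables whose first region's hbase:meta entry has
   * not been modified within the configured timelag; tables modified more recently are
   * counted in numSkipped instead.
   */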
3114 HTableDescriptor[] getTables(AtomicInteger numSkipped) {
3115 List<TableName> tableNames = new ArrayList<TableName>();
3116 long now = System.currentTimeMillis();
3117
3118 for (HbckInfo hbi : regionInfoMap.values()) {
3119 MetaEntry info = hbi.metaEntry;
3120
3121
3122
3123 if (info != null && info.getStartKey().length == 0 && !info.isMetaRegion()) {
3124 if (info.modTime + timelag < now) {
3125 tableNames.add(info.getTable());
3126 } else {
3127 numSkipped.incrementAndGet();
3128 }
3129 }
3130 }
3131 return getHTableDescriptors(tableNames);
3132 }
3133
3134 HTableDescriptor[] getHTableDescriptors(List<TableName> tableNames) {
3135 HTableDescriptor[] htd = new HTableDescriptor[0];
3136 Admin admin = null;
3137 try {
3138 LOG.info("getHTableDescriptors == tableNames => " + tableNames);
3139 admin = new HBaseAdmin(getConf());
3140 htd = admin.getTableDescriptorsByTableName(tableNames);
3141 } catch (IOException e) {
3142 LOG.debug("Exception getting table descriptors", e);
3143 } finally {
3144 if (admin != null) {
3145 try {
3146 admin.close();
3147 } catch (IOException e) {
3148 LOG.debug("Exception closing HBaseAdmin", e);
3149 }
3150 }
3151 }
3152 return htd;
3153 }
3154
3155
3156
3157
3158
3159
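  /**
   * Returns the HbckInfo registered under the given encoded region name, creating and
   * registering an empty one if the region has not been seen before.
   */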
3160 private synchronized HbckInfo getOrCreateInfo(String name) {
3161 HbckInfo hbi = regionInfoMap.get(name);
3162 if (hbi == null) {
3163 hbi = new HbckInfo(null);
3164 regionInfoMap.put(name, hbi);
3165 }
3166 return hbi;
3167 }
3168
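  /**
   * Checks ZooKeeper for expired table locks and, when table lock fixing is enabled,
   * releases the expired ones.
   */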
3169 private void checkAndFixTableLocks() throws IOException {
3170 ZooKeeperWatcher zkw = createZooKeeperWatcher();
3171
3172 try {
3173 TableLockChecker checker = new TableLockChecker(zkw, errors);
3174 checker.checkTableLocks();
3175
3176 if (this.fixTableLocks) {
3177 checker.fixExpiredTableLocks();
3178 }
3179 } finally {
3180 zkw.close();
3181 }
3182 }
3183
3184
3185
3186
3187
3188
3189
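  /**
   * Looks for tables that are in ENABLING state in ZooKeeper but have no entry in
   * hbase:meta (orphaned table znodes) and, when znode fixing is enabled, sets their
   * state to DISABLED.
   */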
3190 private void checkAndFixOrphanedTableZNodes()
3191 throws IOException, KeeperException, InterruptedException {
3192 ZooKeeperWatcher zkw = createZooKeeperWatcher();
3193
3194 try {
3195 Set<TableName> enablingTables = ZKTableStateClientSideReader.getEnablingTables(zkw);
3196 String msg;
3197 TableInfo tableInfo;
3198
3199 for (TableName tableName : enablingTables) {
3200
3201 tableInfo = tablesInfo.get(tableName);
3202 if (tableInfo != null) {
3203
3204 continue;
3205 }
3206
3207 msg = "Table " + tableName + " not found in hbase:meta. Orphaned table ZNode found.";
3208 LOG.warn(msg);
3209 orphanedTableZNodes.add(tableName);
3210 errors.reportError(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY, msg);
3211 }
3212
3213 if (orphanedTableZNodes.size() > 0 && this.fixTableZNodes) {
3214 ZKTableStateManager zkTableStateMgr = new ZKTableStateManager(zkw);
3215
3216 for (TableName tableName : orphanedTableZNodes) {
3217 try {
3218
3219
3220
3221
3222 zkTableStateMgr.setTableState(tableName, ZooKeeperProtos.Table.State.DISABLED);
3223 } catch (CoordinatedStateException e) {
3224
3225 LOG.error(
3226 "Got a CoordinatedStateException while fixing the ENABLING table znode " + tableName,
3227 e);
3228 }
3229 }
3230 }
3231 } finally {
3232 zkw.close();
3233 }
3234 }
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
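  /**
   * Checks the deployment of hbase:meta and its replicas: every configured replica must
   * be deployed on exactly one region server. Missing replicas are assigned, and excess
   * or multiply-assigned replicas are repaired when assignment fixing is enabled.
   *
   * @return true when no problem with the hbase:meta deployment was found
   */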
3245 boolean checkMetaRegion() throws IOException, KeeperException, InterruptedException {
3246 Map<Integer, HbckInfo> metaRegions = new HashMap<Integer, HbckInfo>();
3247 for (HbckInfo value : regionInfoMap.values()) {
3248 if (value.metaEntry != null && value.metaEntry.isMetaRegion()) {
3249 metaRegions.put(value.getReplicaId(), value);
3250 }
3251 }
3252 int metaReplication = admin.getTableDescriptor(TableName.META_TABLE_NAME)
3253 .getRegionReplication();
3254 boolean noProblem = true;
3255
3256
3257 for (int i = 0; i < metaReplication; i++) {
3258 HbckInfo metaHbckInfo = metaRegions.remove(i);
3259 List<ServerName> servers = new ArrayList<ServerName>();
3260 if (metaHbckInfo != null) {
3261 servers = metaHbckInfo.deployedOn;
3262 }
3263 if (servers.size() != 1) {
3264 noProblem = false;
3265 if (servers.size() == 0) {
3266 assignMetaReplica(i);
3267 } else if (servers.size() > 1) {
3268 errors.reportError(ERROR_CODE.MULTI_META_REGION,
3269 "hbase:meta, replicaId " + metaHbckInfo.getReplicaId() +
3270 " is found on more than one region server.");
3271 if (shouldFixAssignments()) {
3272 errors.print("Trying to fix a problem with hbase:meta, replicaId " +
3273 metaHbckInfo.getReplicaId() +"..");
3274 setShouldRerun();
3275
3276 HBaseFsckRepair.fixMultiAssignment(connection, metaHbckInfo.metaEntry, servers);
3277 }
3278 }
3279 }
3280 }
3281
3282 for (Map.Entry<Integer, HbckInfo> entry : metaRegions.entrySet()) {
3283 noProblem = false;
3284 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
3285 "hbase:meta replicas are deployed in excess. Configured " + metaReplication +
3286 ", deployed " + metaRegions.size());
3287 if (shouldFixAssignments()) {
3288 errors.print("Trying to undeploy excess replica, replicaId: " + entry.getKey() +
3289 " of hbase:meta..");
3290 setShouldRerun();
3291 unassignMetaReplica(entry.getValue());
3292 }
3293 }
3294
3295
3296 return noProblem;
3297 }
3298
3299 private void unassignMetaReplica(HbckInfo hi) throws IOException, InterruptedException,
3300 KeeperException {
3301 undeployRegions(hi);
3302 ZooKeeperWatcher zkw = createZooKeeperWatcher();
3303 ZKUtil.deleteNode(zkw, zkw.getZNodeForReplica(hi.metaEntry.getReplicaId()));
3304 }
3305
3306 private void assignMetaReplica(int replicaId)
3307 throws IOException, KeeperException, InterruptedException {
3308 errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta, replicaId " +
3309 replicaId +" is not found on any region.");
3310 if (shouldFixAssignments()) {
3311 errors.print("Trying to fix a problem with hbase:meta..");
3312 setShouldRerun();
3313
3314 HRegionInfo h = RegionReplicaUtil.getRegionInfoForReplica(
3315 HRegionInfo.FIRST_META_REGIONINFO, replicaId);
3316 HBaseFsckRepair.fixUnassigned(admin, h);
3317 HBaseFsckRepair.waitUntilAssigned(admin, h);
3318 }
3319 }
3320
3321
3322
3323
3324
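  /**
   * Scans hbase:meta and records an HbckInfo (with a MetaEntry) for every region and
   * replica found, reporting rows with an empty REGIONINFO_QUALIFIER and flagging
   * regions that are referenced as the result of a merge.
   */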
3325 boolean loadMetaEntries() throws IOException {
3326 MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
3327 int countRecord = 1;
3328
3329
3330 final Comparator<Cell> comp = new Comparator<Cell>() {
3331 @Override
3332 public int compare(Cell k1, Cell k2) {
3333 return Long.compare(k1.getTimestamp(), k2.getTimestamp());
3334 }
3335 };
3336
3337 @Override
3338 public boolean processRow(Result result) throws IOException {
3339 try {
3340
3341
3342 long ts = Collections.max(result.listCells(), comp).getTimestamp();
3343 RegionLocations rl = MetaTableAccessor.getRegionLocations(result);
3344 if (rl == null) {
3345 emptyRegionInfoQualifiers.add(result);
3346 errors.reportError(ERROR_CODE.EMPTY_META_CELL,
3347 "Empty REGIONINFO_QUALIFIER found in hbase:meta");
3348 return true;
3349 }
3350 ServerName sn = null;
3351 if (rl.getRegionLocation(HRegionInfo.DEFAULT_REPLICA_ID) == null ||
3352 rl.getRegionLocation(HRegionInfo.DEFAULT_REPLICA_ID).getRegionInfo() == null) {
3353 emptyRegionInfoQualifiers.add(result);
3354 errors.reportError(ERROR_CODE.EMPTY_META_CELL,
3355 "Empty REGIONINFO_QUALIFIER found in hbase:meta");
3356 return true;
3357 }
3358 HRegionInfo hri = rl.getRegionLocation(HRegionInfo.DEFAULT_REPLICA_ID).getRegionInfo();
3359 if (!(isTableIncluded(hri.getTable())
3360 || hri.isMetaRegion())) {
3361 return true;
3362 }
3363 PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(result);
3364 for (HRegionLocation h : rl.getRegionLocations()) {
3365 if (h == null || h.getRegionInfo() == null) {
3366 continue;
3367 }
3368 sn = h.getServerName();
3369 hri = h.getRegionInfo();
3370
3371 MetaEntry m = null;
3372 if (hri.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
3373 m = new MetaEntry(hri, sn, ts, daughters.getFirst(), daughters.getSecond());
3374 } else {
3375 m = new MetaEntry(hri, sn, ts, null, null);
3376 }
3377 HbckInfo previous = regionInfoMap.get(hri.getEncodedName());
3378 if (previous == null) {
3379 regionInfoMap.put(hri.getEncodedName(), new HbckInfo(m));
3380 } else if (previous.metaEntry == null) {
3381 previous.metaEntry = m;
3382 } else {
3383 throw new IOException("Two entries in hbase:meta are same " + previous);
3384 }
3385 }
3386 PairOfSameType<HRegionInfo> mergeRegions = HRegionInfo.getMergeRegions(result);
3387 for (HRegionInfo mergeRegion : new HRegionInfo[] {
3388 mergeRegions.getFirst(), mergeRegions.getSecond() }) {
3389 if (mergeRegion != null) {
3390
3391 HbckInfo hbInfo = getOrCreateInfo(mergeRegion.getEncodedName());
3392 hbInfo.setMerged(true);
3393 }
3394 }
3395
3396
3397 if (countRecord % 100 == 0) {
3398 errors.progress();
3399 }
3400 countRecord++;
3401 return true;
3402 } catch (RuntimeException e) {
3403 LOG.error("Result=" + result);
3404 throw e;
3405 }
3406 }
3407 };
3408 if (!checkMetaOnly) {
3409
3410 MetaScanner.metaScan(connection, visitor);
3411 }
3412
3413 errors.print("");
3414 return true;
3415 }
3416
3417
3418
3419
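  /**
   * A region entry as read from hbase:meta: the region info plus its assigned server,
   * modification time and, when the row records a split, the two daughter regions.
   */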
3420 static class MetaEntry extends HRegionInfo {
3421 ServerName regionServer;
3422 long modTime;
3423 HRegionInfo splitA, splitB;
3424
3425 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime) {
3426 this(rinfo, regionServer, modTime, null, null);
3427 }
3428
3429 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime,
3430 HRegionInfo splitA, HRegionInfo splitB) {
3431 super(rinfo);
3432 this.regionServer = regionServer;
3433 this.modTime = modTime;
3434 this.splitA = splitA;
3435 this.splitB = splitB;
3436 }
3437
3438 @Override
3439 public boolean equals(Object o) {
3440 boolean superEq = super.equals(o);
3441 if (!superEq) {
3442 return superEq;
3443 }
3444
3445 MetaEntry me = (MetaEntry) o;
3446 if (!regionServer.equals(me.regionServer)) {
3447 return false;
3448 }
3449 return (modTime == me.modTime);
3450 }
3451
3452 @Override
3453 public int hashCode() {
3454 int hash = Arrays.hashCode(getRegionName());
3455 hash ^= getRegionId();
3456 hash ^= Arrays.hashCode(getStartKey());
3457 hash ^= Arrays.hashCode(getEndKey());
3458 hash ^= Boolean.valueOf(isOffline()).hashCode();
3459 hash ^= getTable().hashCode();
3460 if (regionServer != null) {
3461 hash ^= regionServer.hashCode();
3462 }
3463 hash ^= modTime;
3464 return hash;
3465 }
3466 }
3467
3468
3469
3470
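  /**
   * The HDFS view of a region: its directory, the directory's modification time,
   * whether a .regioninfo file is present, and whether the region only contains edits.
   */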
3471 static class HdfsEntry {
3472 HRegionInfo hri;
3473 Path hdfsRegionDir = null;
3474 long hdfsRegionDirModTime = 0;
3475 boolean hdfsRegioninfoFilePresent = false;
3476 boolean hdfsOnlyEdits = false;
3477 }
3478
3479
3480
3481
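  /**
   * A region as reported online by a particular region server.
   */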
3482 static class OnlineEntry {
3483 HRegionInfo hri;
3484 ServerName hsa;
3485
3486 @Override
3487 public String toString() {
3488 return hsa.toString() + ";" + hri.getRegionNameAsString();
3489 }
3490 }
3491
3492
3493
3494
3495
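  /**
   * Aggregates everything hbck knows about one region: the hbase:meta entry, the HDFS
   * entry, and the servers that report the region as deployed, along with replica and
   * merge bookkeeping used during the checks.
   */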
3496 public static class HbckInfo implements KeyRange {
3497 private MetaEntry metaEntry = null;
3498 private HdfsEntry hdfsEntry = null;
3499 private List<OnlineEntry> deployedEntries = Lists.newArrayList();
3500 private List<ServerName> deployedOn = Lists.newArrayList();
3501 private boolean skipChecks = false;
3502 private boolean isMerged = false;
3503 private int deployedReplicaId = HRegionInfo.DEFAULT_REPLICA_ID;
3504 private HRegionInfo primaryHRIForDeployedReplica = null;
3505
3506 HbckInfo(MetaEntry metaEntry) {
3507 this.metaEntry = metaEntry;
3508 }
3509
3510 public int getReplicaId() {
3511 if (metaEntry != null) return metaEntry.getReplicaId();
3512 return deployedReplicaId;
3513 }
3514
3515 public synchronized void addServer(HRegionInfo hri, ServerName server) {
3516 OnlineEntry rse = new OnlineEntry();
3517 rse.hri = hri;
3518 rse.hsa = server;
3519 this.deployedEntries.add(rse);
3520 this.deployedOn.add(server);
3521
3522 this.deployedReplicaId = hri.getReplicaId();
3523 this.primaryHRIForDeployedReplica =
3524 RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
3525 }
3526
3527 @Override
3528 public synchronized String toString() {
3529 StringBuilder sb = new StringBuilder();
3530 sb.append("{ meta => ");
3531 sb.append((metaEntry != null)? metaEntry.getRegionNameAsString() : "null");
3532 sb.append( ", hdfs => " + getHdfsRegionDir());
3533 sb.append( ", deployed => " + Joiner.on(", ").join(deployedEntries));
3534 sb.append( ", replicaId => " + getReplicaId());
3535 sb.append(" }");
3536 return sb.toString();
3537 }
3538
3539 @Override
3540 public byte[] getStartKey() {
3541 if (this.metaEntry != null) {
3542 return this.metaEntry.getStartKey();
3543 } else if (this.hdfsEntry != null) {
3544 return this.hdfsEntry.hri.getStartKey();
3545 } else {
3546 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
3547 return null;
3548 }
3549 }
3550
3551 @Override
3552 public byte[] getEndKey() {
3553 if (this.metaEntry != null) {
3554 return this.metaEntry.getEndKey();
3555 } else if (this.hdfsEntry != null) {
3556 return this.hdfsEntry.hri.getEndKey();
3557 } else {
3558 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
3559 return null;
3560 }
3561 }
3562
3563 public TableName getTableName() {
3564 if (this.metaEntry != null) {
3565 return this.metaEntry.getTable();
3566 } else if (this.hdfsEntry != null) {
3567
3568
3569 Path tableDir = this.hdfsEntry.hdfsRegionDir.getParent();
3570 return FSUtils.getTableName(tableDir);
3571 } else {
3572
3573 for (OnlineEntry e : deployedEntries) {
3574 return e.hri.getTable();
3575 }
3576 return null;
3577 }
3578 }
3579
3580 public String getRegionNameAsString() {
3581 if (metaEntry != null) {
3582 return metaEntry.getRegionNameAsString();
3583 } else if (hdfsEntry != null) {
3584 if (hdfsEntry.hri != null) {
3585 return hdfsEntry.hri.getRegionNameAsString();
3586 }
3587 } else {
3588
3589 for (OnlineEntry e : deployedEntries) {
3590 return e.hri.getRegionNameAsString();
3591 }
3592 }
3593 return null;
3594 }
3595
3596 public byte[] getRegionName() {
3597 if (metaEntry != null) {
3598 return metaEntry.getRegionName();
3599 } else if (hdfsEntry != null) {
3600 return hdfsEntry.hri.getRegionName();
3601 } else {
3602
3603 for (OnlineEntry e : deployedEntries) {
3604 return e.hri.getRegionName();
3605 }
3606 return null;
3607 }
3608 }
3609
3610 public HRegionInfo getPrimaryHRIForDeployedReplica() {
3611 return primaryHRIForDeployedReplica;
3612 }
3613
3614 Path getHdfsRegionDir() {
3615 if (hdfsEntry == null) {
3616 return null;
3617 }
3618 return hdfsEntry.hdfsRegionDir;
3619 }
3620
3621 boolean containsOnlyHdfsEdits() {
3622 if (hdfsEntry == null) {
3623 return false;
3624 }
3625 return hdfsEntry.hdfsOnlyEdits;
3626 }
3627
3628 boolean isHdfsRegioninfoPresent() {
3629 if (hdfsEntry == null) {
3630 return false;
3631 }
3632 return hdfsEntry.hdfsRegioninfoFilePresent;
3633 }
3634
3635 long getModTime() {
3636 if (hdfsEntry == null) {
3637 return 0;
3638 }
3639 return hdfsEntry.hdfsRegionDirModTime;
3640 }
3641
3642 HRegionInfo getHdfsHRI() {
3643 if (hdfsEntry == null) {
3644 return null;
3645 }
3646 return hdfsEntry.hri;
3647 }
3648
3649 public void setSkipChecks(boolean skipChecks) {
3650 this.skipChecks = skipChecks;
3651 }
3652
3653 public boolean isSkipChecks() {
3654 return skipChecks;
3655 }
3656
3657 public void setMerged(boolean isMerged) {
3658 this.isMerged = isMerged;
3659 }
3660
3661 public boolean isMerged() {
3662 return this.isMerged;
3663 }
3664 }
3665
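  /**
   * Orders HbckInfos by table name, then start key, then end key (an empty end key
   * sorts last), falling back to the HDFS regionId as the final tiebreaker.
   */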
3666 final static Comparator<HbckInfo> cmp = new Comparator<HbckInfo>() {
3667 @Override
3668 public int compare(HbckInfo l, HbckInfo r) {
3669 if (l == r) {
3670
3671 return 0;
3672 }
3673
3674 int tableCompare = l.getTableName().compareTo(r.getTableName());
3675 if (tableCompare != 0) {
3676 return tableCompare;
3677 }
3678
3679 int startComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3680 l.getStartKey(), r.getStartKey());
3681 if (startComparison != 0) {
3682 return startComparison;
3683 }
3684
3685
3686 byte[] endKey = r.getEndKey();
3687 endKey = (endKey.length == 0) ? null : endKey;
3688 byte[] endKey2 = l.getEndKey();
3689 endKey2 = (endKey2.length == 0) ? null : endKey2;
3690 int endComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3691 endKey2, endKey);
3692
3693 if (endComparison != 0) {
3694 return endComparison;
3695 }
3696
3697
3698
3699 if (l.hdfsEntry == null && r.hdfsEntry == null) {
3700 return 0;
3701 }
3702 if (l.hdfsEntry == null && r.hdfsEntry != null) {
3703 return 1;
3704 }
3705
3706 if (r.hdfsEntry == null) {
3707 return -1;
3708 }
3709
3710 return Long.compare(l.hdfsEntry.hri.getRegionId(), r.hdfsEntry.hri.getRegionId());
3711 }
3712 };
3713
3714
3715
3716
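  /**
   * Prints a per-table summary: whether the table is consistent, how many regions it
   * has and which servers it is deployed on.
   */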
3717 private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) {
3718 StringBuilder sb = new StringBuilder();
3719 errors.print("Summary:");
3720 for (TableInfo tInfo : tablesInfo.values()) {
3721 if (errors.tableHasErrors(tInfo)) {
3722 errors.print("Table " + tInfo.getName() + " is inconsistent.");
3723 } else {
3724 errors.print(" " + tInfo.getName() + " is okay.");
3725 }
3726 errors.print(" Number of regions: " + tInfo.getNumRegions());
3727 sb.setLength(0);
3728 sb.append(" Deployed on: ");
3729 for (ServerName server : tInfo.deployedOn) {
3730 sb.append(" " + server.toString());
3731 }
3732 errors.print(sb.toString());
3733 }
3734 }
3735
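  /**
   * Instantiates the ErrorReporter named by the "hbasefsck.errorreporter" configuration
   * key, defaulting to PrintingErrorReporter. A custom reporter could be plugged in,
   * for example, with something like
   *   conf.setClass("hbasefsck.errorreporter", MyReporter.class, ErrorReporter.class);
   * where MyReporter is a hypothetical class implementing ErrorReporter.
   */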
3736 static ErrorReporter getErrorReporter(
3737 final Configuration conf) throws ClassNotFoundException {
3738 Class<? extends ErrorReporter> reporter = conf.getClass("hbasefsck.errorreporter", PrintingErrorReporter.class, ErrorReporter.class);
3739 return ReflectionUtils.newInstance(reporter, conf);
3740 }
3741
3742 public interface ErrorReporter {
3743 enum ERROR_CODE {
3744 UNKNOWN, NO_META_REGION, NULL_META_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
3745 NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
3746 MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
3747 FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS,
3748 HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION,
3749 ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE,
3750 WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK, ORPHANED_ZK_TABLE_ENTRY, BOUNDARIES_ERROR
3751 }
3752 void clear();
3753 void report(String message);
3754 void reportError(String message);
3755 void reportError(ERROR_CODE errorCode, String message);
3756 void reportError(ERROR_CODE errorCode, String message, TableInfo table);
3757 void reportError(ERROR_CODE errorCode, String message, TableInfo table, HbckInfo info);
3758 void reportError(
3759 ERROR_CODE errorCode,
3760 String message,
3761 TableInfo table,
3762 HbckInfo info1,
3763 HbckInfo info2
3764 );
3765 int summarize();
3766 void detail(String details);
3767 ArrayList<ERROR_CODE> getErrorList();
3768 void progress();
3769 void print(String message);
3770 void resetErrors();
3771 boolean tableHasErrors(TableInfo table);
3772 }
3773
3774 static class PrintingErrorReporter implements ErrorReporter {
3775 public int errorCount = 0;
3776 private int showProgress;
3777
3778 private static final int progressThreshold = 100;
3779
3780 Set<TableInfo> errorTables = new HashSet<TableInfo>();
3781
3782
3783 private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
3784
3785 @Override
3786 public void clear() {
3787 errorTables.clear();
3788 errorList.clear();
3789 errorCount = 0;
3790 }
3791
3792 @Override
3793 public synchronized void reportError(ERROR_CODE errorCode, String message) {
3794 if (errorCode == ERROR_CODE.WRONG_USAGE) {
3795 System.err.println(message);
3796 return;
3797 }
3798
3799 errorList.add(errorCode);
3800 if (!summary) {
3801 System.out.println("ERROR: " + message);
3802 }
3803 errorCount++;
3804 showProgress = 0;
3805 }
3806
3807 @Override
3808 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
3809 errorTables.add(table);
3810 reportError(errorCode, message);
3811 }
3812
3813 @Override
3814 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3815 HbckInfo info) {
3816 errorTables.add(table);
3817 String reference = "(region " + info.getRegionNameAsString() + ")";
3818 reportError(errorCode, reference + " " + message);
3819 }
3820
3821 @Override
3822 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3823 HbckInfo info1, HbckInfo info2) {
3824 errorTables.add(table);
3825 String reference = "(regions " + info1.getRegionNameAsString()
3826 + " and " + info2.getRegionNameAsString() + ")";
3827 reportError(errorCode, reference + " " + message);
3828 }
3829
3830 @Override
3831 public synchronized void reportError(String message) {
3832 reportError(ERROR_CODE.UNKNOWN, message);
3833 }
3834
3835
3836
3837
3838
3839
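/**
 * Print a message as an error without recording it against the error count.
 */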
3840 @Override
3841 public synchronized void report(String message) {
3842 if (!summary) {
3843 System.out.println("ERROR: " + message);
3844 }
3845 showProgress = 0;
3846 }
3847
3848 @Override
3849 public synchronized int summarize() {
3850 System.out.println(Integer.toString(errorCount) +
3851 " inconsistencies detected.");
3852 if (errorCount == 0) {
3853 System.out.println("Status: OK");
3854 return 0;
3855 } else {
3856 System.out.println("Status: INCONSISTENT");
3857 return -1;
3858 }
3859 }
3860
3861 @Override
3862 public ArrayList<ERROR_CODE> getErrorList() {
3863 return errorList;
3864 }
3865
3866 @Override
3867 public synchronized void print(String message) {
3868 if (!summary) {
3869 System.out.println(message);
3870 }
3871 }
3872
3873 @Override
3874 public boolean tableHasErrors(TableInfo table) {
3875 return errorTables.contains(table);
3876 }
3877
3878 @Override
3879 public void resetErrors() {
3880 errorCount = 0;
3881 }
3882
3883 @Override
3884 public synchronized void detail(String message) {
3885 if (details) {
3886 System.out.println(message);
3887 }
3888 showProgress = 0;
3889 }
3890
3891 @Override
3892 public synchronized void progress() {
3893 if (showProgress++ == progressThreshold) {
3894 if (!summary) {
3895 System.out.print(".");
3896 }
3897 showProgress = 0;
3898 }
3899 }
3900 }
3901
3902
3903
3904
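/**
 * Contact a region server and gather information about the regions it is serving.
 */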
3905 static class WorkItemRegion implements Callable<Void> {
3906 private HBaseFsck hbck;
3907 private ServerName rsinfo;
3908 private ErrorReporter errors;
3909 private HConnection connection;
3910
3911 WorkItemRegion(HBaseFsck hbck, ServerName info,
3912 ErrorReporter errors, HConnection connection) {
3913 this.hbck = hbck;
3914 this.rsinfo = info;
3915 this.errors = errors;
3916 this.connection = connection;
3917 }
3918
3919 @Override
3920 public synchronized Void call() throws IOException {
3921 errors.progress();
3922 try {
3923 BlockingInterface server = connection.getAdmin(rsinfo);
3924
3925
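// ask the region server for all regions it is currently serving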
3926 List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
3927 regions = filterRegions(regions);
3928
3929 if (details) {
3930 errors.detail("RegionServer: " + rsinfo.getServerName() +
3931 " number of regions: " + regions.size());
3932 for (HRegionInfo rinfo: regions) {
3933 errors.detail(" " + rinfo.getRegionNameAsString() +
3934 " id: " + rinfo.getRegionId() +
3935 " encoded_name: " + rinfo.getEncodedName() +
3936 " start: " + Bytes.toStringBinary(rinfo.getStartKey()) +
3937 " end: " + Bytes.toStringBinary(rinfo.getEndKey()));
3938 }
3939 }
3940
3941
3942 for (HRegionInfo r:regions) {
3943 HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
3944 hbi.addServer(r, rsinfo);
3945 }
3946 } catch (IOException e) {
3947 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
3948 " Unable to fetch region information. " + e);
3949 throw e;
3950 }
3951 return null;
3952 }
3953
3954 private List<HRegionInfo> filterRegions(List<HRegionInfo> regions) {
3955 List<HRegionInfo> ret = Lists.newArrayList();
3956 for (HRegionInfo hri : regions) {
3957 if (hri.isMetaTable() || (!hbck.checkMetaOnly
3958 && hbck.isTableIncluded(hri.getTable()))) {
3959 ret.add(hri);
3960 }
3961 }
3962 return ret;
3963 }
3964 }
3965
3966
3967
3968
3969
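/**
 * Contact hdfs and gather information about the region directories under a table directory.
 */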
3970 static class WorkItemHdfsDir implements Callable<Void> {
3971 private HBaseFsck hbck;
3972 private FileStatus tableDir;
3973 private ErrorReporter errors;
3974 private FileSystem fs;
3975
3976 WorkItemHdfsDir(HBaseFsck hbck, FileSystem fs, ErrorReporter errors,
3977 FileStatus status) {
3978 this.hbck = hbck;
3979 this.fs = fs;
3980 this.tableDir = status;
3981 this.errors = errors;
3982 }
3983
3984 @Override
3985 public synchronized Void call() throws IOException {
3986 try {
3987
3988 FileStatus[] regionDirs = fs.listStatus(tableDir.getPath());
3989 for (FileStatus regionDir : regionDirs) {
3990 errors.progress();
3991 String encodedName = regionDir.getPath().getName();
3992
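// skip directories whose names are not hex-encoded region names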
3993 if (!encodedName.toLowerCase().matches("[0-9a-f]+")) {
3994 continue;
3995 }
3996
3997 LOG.debug("Loading region info from hdfs:"+ regionDir.getPath());
3998 HbckInfo hbi = hbck.getOrCreateInfo(encodedName);
3999 HdfsEntry he = new HdfsEntry();
4000 synchronized (hbi) {
4001 if (hbi.getHdfsRegionDir() != null) {
4002 errors.print("Directory " + encodedName + " duplicate?? " +
4003 hbi.getHdfsRegionDir());
4004 }
4005
4006 he.hdfsRegionDir = regionDir.getPath();
4007 he.hdfsRegionDirModTime = regionDir.getModificationTime();
4008 Path regioninfoFile = new Path(he.hdfsRegionDir, HRegionFileSystem.REGION_INFO_FILE);
4009 he.hdfsRegioninfoFilePresent = fs.exists(regioninfoFile);
4010
4011
4012
4013
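// flag the region as edits-only if it contains nothing but the recovered
// edits directory (and dot files)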
4014 he.hdfsOnlyEdits = true;
4015 FileStatus[] subDirs = fs.listStatus(regionDir.getPath());
4016 Path ePath = WALSplitter.getRegionDirRecoveredEditsDir(regionDir.getPath());
4017 for (FileStatus subDir : subDirs) {
4018 errors.progress();
4019 String sdName = subDir.getPath().getName();
4020 if (!sdName.startsWith(".") && !sdName.equals(ePath.getName())) {
4021 he.hdfsOnlyEdits = false;
4022 break;
4023 }
4024 }
4025 hbi.hdfsEntry = he;
4026 }
4027 }
4028 } catch (IOException e) {
4029
4030 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "Table Directory: "
4031 + tableDir.getPath().getName()
4032 + " Unable to fetch region information. " + e);
4033 throw e;
4034 }
4035 return null;
4036 }
4037 }
4038
4039
4040
4041
4042
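/**
 * Load the .regioninfo file for a single region from HDFS, marking the region
 * as an orphan if the file cannot be read.
 */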
4043 static class WorkItemHdfsRegionInfo implements Callable<Void> {
4044 private HbckInfo hbi;
4045 private HBaseFsck hbck;
4046 private ErrorReporter errors;
4047
4048 WorkItemHdfsRegionInfo(HbckInfo hbi, HBaseFsck hbck, ErrorReporter errors) {
4049 this.hbi = hbi;
4050 this.hbck = hbck;
4051 this.errors = errors;
4052 }
4053
4054 @Override
4055 public synchronized Void call() throws IOException {
4056
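// only load the .regioninfo if it has not been loaded already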
4057 if (hbi.getHdfsHRI() == null) {
4058 try {
4059 errors.progress();
4060 hbck.loadHdfsRegioninfo(hbi);
4061 } catch (IOException ioe) {
4062 String msg = "Orphan region in HDFS: Unable to load .regioninfo from table "
4063 + hbi.getTableName() + " in hdfs dir "
4064 + hbi.getHdfsRegionDir()
4065 + "! It may be an invalid format or version file. Treating as "
4066 + "an orphaned regiondir.";
4067 errors.reportError(ERROR_CODE.ORPHAN_HDFS_REGION, msg);
4068 try {
4069 hbck.debugLsr(hbi.getHdfsRegionDir());
4070 } catch (IOException ioe2) {
4071 LOG.error("Unable to read directory " + hbi.getHdfsRegionDir(), ioe2);
4072 throw ioe2;
4073 }
4074 hbck.orphanHdfsDirs.add(hbi);
4075 throw ioe;
4076 }
4077 }
4078 return null;
4079 }
4080 };
4081
4082
4083
4084
4085
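/**
 * Display the full report from fsck, including per-region details.
 */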
4086 public static void setDisplayFullReport() {
4087 details = true;
4088 }
4089
4090
4091
4092
4093
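/**
 * Print only a summary of the tables and their status.
 */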
4094 void setSummary() {
4095 summary = true;
4096 }
4097
4098
4099
4100
4101
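/**
 * Check only the state of the hbase:meta table.
 */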
4102 void setCheckMetaOnly() {
4103 checkMetaOnly = true;
4104 }
4105
4106
4107
4108
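/**
 * Verify that region boundaries in hbase:meta match the store files.
 */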
4109 void setRegionBoundariesCheck() {
4110 checkRegionBoundaries = true;
4111 }
4112
4113
4114
4115
4116
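/**
 * Set table locks fix mode: delete table locks that have been held for a long time.
 */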
4117 public void setFixTableLocks(boolean shouldFix) {
4118 fixTableLocks = shouldFix;
4119 fixAny |= shouldFix;
4120 }
4121
4122
4123
4124
4125
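/**
 * Set orphaned table ZNodes fix mode: set the table state in ZooKeeper to disabled
 * if the table does not exist.
 */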
4126 public void setFixTableZNodes(boolean shouldFix) {
4127 fixTableZNodes = shouldFix;
4128 fixAny |= shouldFix;
4129 }
4130
4131
4132
4133
4134
4135
4136
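/**
 * Mark that a fix was attempted, so fsck should be rerun to verify the result.
 */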
4137 void setShouldRerun() {
4138 rerun = true;
4139 }
4140
4141 boolean shouldRerun() {
4142 return rerun;
4143 }
4144
4145
4146
4147
4148
4149 public void setFixAssignments(boolean shouldFix) {
4150 fixAssignments = shouldFix;
4151 fixAny |= shouldFix;
4152 }
4153
4154 boolean shouldFixAssignments() {
4155 return fixAssignments;
4156 }
4157
4158 public void setFixMeta(boolean shouldFix) {
4159 fixMeta = shouldFix;
4160 fixAny |= shouldFix;
4161 }
4162
4163 boolean shouldFixMeta() {
4164 return fixMeta;
4165 }
4166
4167 public void setFixEmptyMetaCells(boolean shouldFix) {
4168 fixEmptyMetaCells = shouldFix;
4169 fixAny |= shouldFix;
4170 }
4171
4172 boolean shouldFixEmptyMetaCells() {
4173 return fixEmptyMetaCells;
4174 }
4175
4176 public void setCheckHdfs(boolean checking) {
4177 checkHdfs = checking;
4178 }
4179
4180 boolean shouldCheckHdfs() {
4181 return checkHdfs;
4182 }
4183
4184 public void setFixHdfsHoles(boolean shouldFix) {
4185 fixHdfsHoles = shouldFix;
4186 fixAny |= shouldFix;
4187 }
4188
4189 boolean shouldFixHdfsHoles() {
4190 return fixHdfsHoles;
4191 }
4192
4193 public void setFixTableOrphans(boolean shouldFix) {
4194 fixTableOrphans = shouldFix;
4195 fixAny |= shouldFix;
4196 }
4197
4198 boolean shouldFixTableOrphans() {
4199 return fixTableOrphans;
4200 }
4201
4202 public void setFixHdfsOverlaps(boolean shouldFix) {
4203 fixHdfsOverlaps = shouldFix;
4204 fixAny |= shouldFix;
4205 }
4206
4207 boolean shouldFixHdfsOverlaps() {
4208 return fixHdfsOverlaps;
4209 }
4210
4211 public void setFixHdfsOrphans(boolean shouldFix) {
4212 fixHdfsOrphans = shouldFix;
4213 fixAny |= shouldFix;
4214 }
4215
4216 boolean shouldFixHdfsOrphans() {
4217 return fixHdfsOrphans;
4218 }
4219
4220 public void setFixVersionFile(boolean shouldFix) {
4221 fixVersionFile = shouldFix;
4222 fixAny |= shouldFix;
4223 }
4224
4225 public boolean shouldFixVersionFile() {
4226 return fixVersionFile;
4227 }
4228
4229 public void setSidelineBigOverlaps(boolean sbo) {
4230 this.sidelineBigOverlaps = sbo;
4231 }
4232
4233 public boolean shouldSidelineBigOverlaps() {
4234 return sidelineBigOverlaps;
4235 }
4236
4237 public void setFixSplitParents(boolean shouldFix) {
4238 fixSplitParents = shouldFix;
4239 fixAny |= shouldFix;
4240 }
4241
4242 boolean shouldFixSplitParents() {
4243 return fixSplitParents;
4244 }
4245
4246 public void setFixReferenceFiles(boolean shouldFix) {
4247 fixReferenceFiles = shouldFix;
4248 fixAny |= shouldFix;
4249 }
4250
4251 boolean shouldFixReferenceFiles() {
4252 return fixReferenceFiles;
4253 }
4254
4255 public boolean shouldIgnorePreCheckPermission() {
4256 return !fixAny || ignorePreCheckPermission;
4257 }
4258
4259 public void setIgnorePreCheckPermission(boolean ignorePreCheckPermission) {
4260 this.ignorePreCheckPermission = ignorePreCheckPermission;
4261 }
4262
4263
4264
4265
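/**
 * Maximum number of regions that will be merged when fixing region overlaps.
 */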
4266 public void setMaxMerge(int mm) {
4267 this.maxMerge = mm;
4268 }
4269
4270 public int getMaxMerge() {
4271 return maxMerge;
4272 }
4273
4274 public void setMaxOverlapsToSideline(int mo) {
4275 this.maxOverlapsToSideline = mo;
4276 }
4277
4278 public int getMaxOverlapsToSideline() {
4279 return maxOverlapsToSideline;
4280 }
4281
4282
4283
4284
4285
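/**
 * Only check/fix the tables that were explicitly included; an empty set means
 * all tables are checked.
 */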
4286 boolean isTableIncluded(TableName table) {
4287 return tablesIncluded.isEmpty() || tablesIncluded.contains(table);
4288 }
4289
4290 public void includeTable(TableName table) {
4291 tablesIncluded.add(table);
4292 }
4293
4294 Set<TableName> getIncludedTables() {
4295 return new HashSet<TableName>(tablesIncluded);
4296 }
4297
4298
4299
4300
4301
4302
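/**
 * Process only regions that have not experienced any metadata updates in the last
 * <code>seconds</code> seconds.
 */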
4303 public void setTimeLag(long seconds) {
4304 timelag = seconds * 1000;
4305 }
4306
4307
4308
4309
4310
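/**
 * @param sidelineDir HDFS path used to back up (sideline) data.
 */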
4311 public void setSidelineDir(String sidelineDir) {
4312 this.sidelineDir = new Path(sidelineDir);
4313 }
4314
4315 protected HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
4316 return new HFileCorruptionChecker(getConf(), executor, sidelineCorruptHFiles);
4317 }
4318
4319 public HFileCorruptionChecker getHFilecorruptionChecker() {
4320 return hfcc;
4321 }
4322
4323 public void setHFileCorruptionChecker(HFileCorruptionChecker hfcc) {
4324 this.hfcc = hfcc;
4325 }
4326
4327 public void setRetCode(int code) {
4328 this.retcode = code;
4329 }
4330
4331 public int getRetCode() {
4332 return retcode;
4333 }
4334
4335 protected HBaseFsck printUsageAndExit() {
4336 StringWriter sw = new StringWriter(2048);
4337 PrintWriter out = new PrintWriter(sw);
4338 out.println("Usage: fsck [opts] {only tables}");
4339 out.println(" where [opts] are:");
4340 out.println(" -help Display help options (this)");
4341 out.println(" -details Display full report of all regions.");
4342 out.println(" -timelag <timeInSeconds> Process only regions that " +
4343 " have not experienced any metadata updates in the last " +
4344 " <timeInSeconds> seconds.");
4345 out.println(" -sleepBeforeRerun <timeInSeconds> Sleep this many seconds" +
4346 " before checking whether the fix worked, when run with -fix");
4347 out.println(" -summary Print only summary of the tables and status.");
4348 out.println(" -metaonly Only check the state of the hbase:meta table.");
4349 out.println(" -sidelineDir <hdfs://> HDFS path to back up existing meta.");
4350 out.println(" -boundaries Verify that region boundaries are the same between META and store files.");
4351
4352 out.println("");
4353 out.println(" Metadata Repair options: (expert features, use with caution!)");
4354 out.println(" -fix Try to fix region assignments. This is for backwards compatibility.");
4355 out.println(" -fixAssignments Try to fix region assignments. Replaces the old -fix");
4356 out.println(" -fixMeta Try to fix meta problems. This assumes HDFS region info is good.");
4357 out.println(" -noHdfsChecking Don't load/check region info from HDFS."
4358 + " Assumes hbase:meta region info is good. Won't check/fix any HDFS issue, e.g. hole, orphan, or overlap");
4359 out.println(" -fixHdfsHoles Try to fix region holes in hdfs.");
4360 out.println(" -fixHdfsOrphans Try to fix region dirs with no .regioninfo file in hdfs");
4361 out.println(" -fixTableOrphans Try to fix table dirs with no .tableinfo file in hdfs (online mode only)");
4362 out.println(" -fixHdfsOverlaps Try to fix region overlaps in hdfs.");
4363 out.println(" -fixVersionFile Try to fix missing hbase.version file in hdfs.");
4364 out.println(" -maxMerge <n> When fixing region overlaps, allow at most <n> regions to merge. (n=" + DEFAULT_MAX_MERGE +" by default)");
4365 out.println(" -sidelineBigOverlaps When fixing region overlaps, allow big overlaps to be sidelined");
4366 out.println(" -maxOverlapsToSideline <n> When fixing region overlaps, allow at most <n> regions to sideline per group. (n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)");
4367 out.println(" -fixSplitParents Try to force offline split parents to be online.");
4368 out.println(" -ignorePreCheckPermission ignore filesystem permission pre-check");
4369 out.println(" -fixReferenceFiles Try to offline lingering reference store files");
4370 out.println(" -fixEmptyMetaCells Try to fix hbase:meta entries not referencing any region"
4371 + " (empty REGIONINFO_QUALIFIER rows)");
4372
4373 out.println("");
4374 out.println(" Datafile Repair options: (expert features, use with caution!)");
4375 out.println(" -checkCorruptHFiles Check all HFiles by opening them to make sure they are valid");
4376 out.println(" -sidelineCorruptHFiles Quarantine corrupted HFiles. Implies -checkCorruptHFiles");
4377
4378 out.println("");
4379 out.println(" Metadata Repair shortcuts");
4380 out.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " +
4381 "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps " +
4382 "-fixReferenceFiles -fixTableLocks -fixOrphanedTableZnodes");
4383 out.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles");
4384
4385 out.println("");
4386 out.println(" Table lock options");
4387 out.println(" -fixTableLocks Deletes table locks held for a long time (hbase.table.lock.expire.ms, 10min by default)");
4388
4389 out.println("");
4390 out.println(" Table Znode options");
4391 out.println(" -fixOrphanedTableZnodes Set table state in ZNode to disabled if table does not exist");
4392
4393 out.flush();
4394 errors.reportError(ERROR_CODE.WRONG_USAGE, sw.toString());
4395
4396 setRetCode(-2);
4397 return this;
4398 }
4399
4400
4401
4402
4403
4404
4405
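/**
 * Main program: runs hbck as a command-line tool through ToolRunner.
 */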
4406 public static void main(String[] args) throws Exception {
4407
4408 Configuration conf = HBaseConfiguration.create();
4409 Path hbasedir = FSUtils.getRootDir(conf);
4410 URI defaultFs = hbasedir.getFileSystem(conf).getUri();
4411 FSUtils.setFsDefault(conf, new Path(defaultFs));
4412 int ret = ToolRunner.run(new HBaseFsckTool(conf), args);
4413 System.exit(ret);
4414 }
4415
4416
4417
4418
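/**
 * Tool wrapper that allows HBaseFsck to be driven through ToolRunner.
 */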
4419 static class HBaseFsckTool extends Configured implements Tool {
4420 HBaseFsckTool(Configuration conf) { super(conf); }
4421 @Override
4422 public int run(String[] args) throws Exception {
4423 HBaseFsck hbck = new HBaseFsck(getConf());
4424 hbck.exec(hbck.executor, args);
4425 hbck.close();
4426 return hbck.getRetCode();
4427 }
4428 };
4429
4430
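/**
 * Parse the command-line options, then run the requested checks and fixes.
 */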
4431 public HBaseFsck exec(ExecutorService exec, String[] args) throws KeeperException, IOException,
4432 ServiceException, InterruptedException {
4433 long sleepBeforeRerun = DEFAULT_SLEEP_BEFORE_RERUN;
4434
4435 boolean checkCorruptHFiles = false;
4436 boolean sidelineCorruptHFiles = false;
4437
4438
4439 for (int i = 0; i < args.length; i++) {
4440 String cmd = args[i];
4441 if (cmd.equals("-help") || cmd.equals("-h")) {
4442 return printUsageAndExit();
4443 } else if (cmd.equals("-details")) {
4444 setDisplayFullReport();
4445 } else if (cmd.equals("-timelag")) {
4446 if (i == args.length - 1) {
4447 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
4448 return printUsageAndExit();
4449 }
4450 try {
4451 long timelag = Long.parseLong(args[i+1]);
4452 setTimeLag(timelag);
4453 } catch (NumberFormatException e) {
4454 errors.reportError(ERROR_CODE.WRONG_USAGE, "-timelag needs a numeric value.");
4455 return printUsageAndExit();
4456 }
4457 i++;
4458 } else if (cmd.equals("-sleepBeforeRerun")) {
4459 if (i == args.length - 1) {
4460 errors.reportError(ERROR_CODE.WRONG_USAGE,
4461 "HBaseFsck: -sleepBeforeRerun needs a value.");
4462 return printUsageAndExit();
4463 }
4464 try {
4465 sleepBeforeRerun = Long.parseLong(args[i+1]);
4466 } catch (NumberFormatException e) {
4467 errors.reportError(ERROR_CODE.WRONG_USAGE, "-sleepBeforeRerun needs a numeric value.");
4468 return printUsageAndExit();
4469 }
4470 i++;
4471 } else if (cmd.equals("-sidelineDir")) {
4472 if (i == args.length - 1) {
4473 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -sidelineDir needs a value.");
4474 return printUsageAndExit();
4475 }
4476 i++;
4477 setSidelineDir(args[i]);
4478 } else if (cmd.equals("-fix")) {
4479 errors.reportError(ERROR_CODE.WRONG_USAGE,
4480 "This option is deprecated, please use -fixAssignments instead.");
4481 setFixAssignments(true);
4482 } else if (cmd.equals("-fixAssignments")) {
4483 setFixAssignments(true);
4484 } else if (cmd.equals("-fixMeta")) {
4485 setFixMeta(true);
4486 } else if (cmd.equals("-noHdfsChecking")) {
4487 setCheckHdfs(false);
4488 } else if (cmd.equals("-fixHdfsHoles")) {
4489 setFixHdfsHoles(true);
4490 } else if (cmd.equals("-fixHdfsOrphans")) {
4491 setFixHdfsOrphans(true);
4492 } else if (cmd.equals("-fixTableOrphans")) {
4493 setFixTableOrphans(true);
4494 } else if (cmd.equals("-fixHdfsOverlaps")) {
4495 setFixHdfsOverlaps(true);
4496 } else if (cmd.equals("-fixVersionFile")) {
4497 setFixVersionFile(true);
4498 } else if (cmd.equals("-sidelineBigOverlaps")) {
4499 setSidelineBigOverlaps(true);
4500 } else if (cmd.equals("-fixSplitParents")) {
4501 setFixSplitParents(true);
4502 } else if (cmd.equals("-ignorePreCheckPermission")) {
4503 setIgnorePreCheckPermission(true);
4504 } else if (cmd.equals("-checkCorruptHFiles")) {
4505 checkCorruptHFiles = true;
4506 } else if (cmd.equals("-sidelineCorruptHFiles")) {
4507 sidelineCorruptHFiles = true;
4508 } else if (cmd.equals("-fixReferenceFiles")) {
4509 setFixReferenceFiles(true);
4510 } else if (cmd.equals("-fixEmptyMetaCells")) {
4511 setFixEmptyMetaCells(true);
4512 } else if (cmd.equals("-repair")) {
4513
4514
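// turn on the full repair shortcut (everything listed for -repair in the usage text);
// split parents are left alone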
4515 setFixHdfsHoles(true);
4516 setFixHdfsOrphans(true);
4517 setFixMeta(true);
4518 setFixAssignments(true);
4519 setFixHdfsOverlaps(true);
4520 setFixVersionFile(true);
4521 setSidelineBigOverlaps(true);
4522 setFixSplitParents(false);
4523 setCheckHdfs(true);
4524 setFixReferenceFiles(true);
4525 setFixTableLocks(true);
4526 setFixTableZNodes(true);
4527 } else if (cmd.equals("-repairHoles")) {
4528
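// -repairHoles: fix holes, meta and assignments only; leave overlaps, orphaned
// regiondirs and split parents alone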
4529 setFixHdfsHoles(true);
4530 setFixHdfsOrphans(false);
4531 setFixMeta(true);
4532 setFixAssignments(true);
4533 setFixHdfsOverlaps(false);
4534 setSidelineBigOverlaps(false);
4535 setFixSplitParents(false);
4536 setCheckHdfs(true);
4537 } else if (cmd.equals("-maxOverlapsToSideline")) {
4538 if (i == args.length - 1) {
4539 errors.reportError(ERROR_CODE.WRONG_USAGE,
4540 "-maxOverlapsToSideline needs a numeric value argument.");
4541 return printUsageAndExit();
4542 }
4543 try {
4544 int maxOverlapsToSideline = Integer.parseInt(args[i+1]);
4545 setMaxOverlapsToSideline(maxOverlapsToSideline);
4546 } catch (NumberFormatException e) {
4547 errors.reportError(ERROR_CODE.WRONG_USAGE,
4548 "-maxOverlapsToSideline needs a numeric value argument.");
4549 return printUsageAndExit();
4550 }
4551 i++;
4552 } else if (cmd.equals("-maxMerge")) {
4553 if (i == args.length - 1) {
4554 errors.reportError(ERROR_CODE.WRONG_USAGE,
4555 "-maxMerge needs a numeric value argument.");
4556 return printUsageAndExit();
4557 }
4558 try {
4559 int maxMerge = Integer.parseInt(args[i+1]);
4560 setMaxMerge(maxMerge);
4561 } catch (NumberFormatException e) {
4562 errors.reportError(ERROR_CODE.WRONG_USAGE,
4563 "-maxMerge needs a numeric value argument.");
4564 return printUsageAndExit();
4565 }
4566 i++;
4567 } else if (cmd.equals("-summary")) {
4568 setSummary();
4569 } else if (cmd.equals("-metaonly")) {
4570 setCheckMetaOnly();
4571 } else if (cmd.equals("-boundaries")) {
4572 setRegionBoundariesCheck();
4573 } else if (cmd.equals("-fixTableLocks")) {
4574 setFixTableLocks(true);
4575 } else if (cmd.equals("-fixOrphanedTableZnodes")) {
4576 setFixTableZNodes(true);
4577 } else if (cmd.startsWith("-")) {
4578 errors.reportError(ERROR_CODE.WRONG_USAGE, "Unrecognized option: " + cmd);
4579 return printUsageAndExit();
4580 } else {
4581 includeTable(TableName.valueOf(cmd));
4582 errors.print("Allow checking/fixes for table: " + cmd);
4583 }
4584 }
4585
4586 errors.print("HBaseFsck command line options: " + StringUtils.join(args, " "));
4587
4588
4589 try {
4590 preCheckPermission();
4591 } catch (AccessDeniedException ace) {
4592 Runtime.getRuntime().exit(-1);
4593 } catch (IOException ioe) {
4594 Runtime.getRuntime().exit(-1);
4595 }
4596
4597
4598 connect();
4599
4600 try {
4601
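// if requested, check every HFile for corruption before the consistency checks,
// quarantining corrupt files when -sidelineCorruptHFiles is set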
4602 if (checkCorruptHFiles || sidelineCorruptHFiles) {
4603 LOG.info("Checking all hfiles for corruption");
4604 HFileCorruptionChecker hfcc = createHFileCorruptionChecker(sidelineCorruptHFiles);
4605 setHFileCorruptionChecker(hfcc);
4606 Collection<TableName> tables = getIncludedTables();
4607 Collection<Path> tableDirs = new ArrayList<Path>();
4608 Path rootdir = FSUtils.getRootDir(getConf());
4609 if (tables.size() > 0) {
4610 for (TableName t : tables) {
4611 tableDirs.add(FSUtils.getTableDir(rootdir, t));
4612 }
4613 } else {
4614 tableDirs = FSUtils.getTableDirs(FSUtils.getCurrentFileSystem(getConf()), rootdir);
4615 }
4616 hfcc.checkTables(tableDirs);
4617 hfcc.report(errors);
4618 }
4619
4620
4621 int code = onlineHbck();
4622 setRetCode(code);
4623
4624
4625
4626
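// if a fix was applied, sleep and then run the checks again to verify the result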
4627 if (shouldRerun()) {
4628 try {
4629 LOG.info("Sleeping " + sleepBeforeRerun + "ms before re-checking after fix...");
4630 Thread.sleep(sleepBeforeRerun);
4631 } catch (InterruptedException ie) {
4632 LOG.warn("Interrupted while sleeping");
4633 return this;
4634 }
4635
4636 setFixAssignments(false);
4637 setFixMeta(false);
4638 setFixHdfsHoles(false);
4639 setFixHdfsOverlaps(false);
4640 setFixVersionFile(false);
4641 setFixTableOrphans(false);
4642 errors.resetErrors();
4643 code = onlineHbck();
4644 setRetCode(code);
4645 }
4646 } finally {
4647 IOUtils.cleanup(null, this);
4648 }
4649 return this;
4650 }
4651
4652
4653
4654
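/**
 * Recursively print the contents of a path (ls -r) for debugging.
 */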
4655 void debugLsr(Path p) throws IOException {
4656 debugLsr(getConf(), p, errors);
4657 }
4658
4659
4660
4661
4662 public static void debugLsr(Configuration conf,
4663 Path p) throws IOException {
4664 debugLsr(conf, p, new PrintingErrorReporter());
4665 }
4666
4667
4668
4669
4670 public static void debugLsr(Configuration conf,
4671 Path p, ErrorReporter errors) throws IOException {
4672 if (!LOG.isDebugEnabled() || p == null) {
4673 return;
4674 }
4675 FileSystem fs = p.getFileSystem(conf);
4676
4677 if (!fs.exists(p)) {
4678
4679 return;
4680 }
4681 errors.print(p.toString());
4682
4683 if (fs.isFile(p)) {
4684 return;
4685 }
4686
4687 if (fs.getFileStatus(p).isDirectory()) {
4688 FileStatus[] fss= fs.listStatus(p);
4689 for (FileStatus status : fss) {
4690 debugLsr(conf, status.getPath(), errors);
4691 }
4692 }
4693 }
4694 }