Skip to content

Commit 81e8c71

Browse files
committed
Bug 38279241 - [37442204->14.1.2.0.4] Snapshot validation passes for snapshots can't be recovered (missing partition folders aren't detected) (14.1.2.0 cl 118190 --> 14.1.2.0 CE)
[git-p4: depot-paths = "//dev/coherence-ce/release/coherence-ce-v14.1.2.0/": change = 118264]
1 parent 916cd62 commit 81e8c71

File tree

5 files changed

+285
-43
lines changed

5 files changed

+285
-43
lines changed

prj/coherence-core-components/src/main/java/com/tangosol/coherence/component/util/daemon/queueProcessor/service/grid/PartitionedService.java

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@
121121
import java.util.concurrent.CopyOnWriteArraySet;
122122
import java.util.concurrent.atomic.AtomicInteger;
123123
import java.util.concurrent.atomic.AtomicLong;
124+
import java.util.regex.Matcher;
125+
import java.util.regex.Pattern;
126+
import java.util.stream.Collectors;
124127

125128
import static com.tangosol.internal.util.VersionHelper.VERSION_12_2_1_4_25;
126129
import static com.tangosol.internal.util.VersionHelper.VERSION_14_1_1_0_21;
@@ -28980,11 +28983,43 @@ public String[] listFailedSnapshots()
2898028983
msgRequest.setSnapshotName(null);
2898128984
msgRequest.setFailed(true); // this ensures we get list of failed snapshots
2898228985

28983-
String[] asNames = (String[]) service.poll(msgRequest);
28984-
setSnapshotFailures(asNames);
28985-
return asNames;
28986+
String[] failedSnapshots = reconcileValidatedSnapshots((String[]) service.poll(msgRequest));
28987+
setSnapshotFailures(failedSnapshots);
28988+
return failedSnapshots;
2898628989
}
2898728990

28991+
private String[] reconcileValidatedSnapshots(String[] asNames)
28992+
{
28993+
if (asNames == null)
28994+
{
28995+
return null;
28996+
}
28997+
Set<String> failedSnapshots = new HashSet<>();
28998+
Map<String, Set<String>> snapPartitionsMap = new HashMap<>();
28999+
Pattern p = Pattern.compile("^(?<name>.+?)~~~\\[(?<partitions>(?:(?:\\d+-[0-9a-f]+-[0-9a-f]+-\\d+),?)+)\\]$");
29000+
for (String snapshot : asNames)
29001+
{
29002+
Matcher m = p.matcher(snapshot);
29003+
if (m.matches())
29004+
{
29005+
snapPartitionsMap.computeIfAbsent(m.group("name"), s -> new HashSet())
29006+
.addAll(Arrays.stream(m.group("partitions").split(","))
29007+
.collect(Collectors.toSet()));
29008+
}
29009+
else
29010+
{
29011+
// snapshot name w/o validated partition names
29012+
failedSnapshots.add(snapshot);
29013+
}
29014+
}
29015+
int partitionCount = getService().getPartitionCount();
29016+
List<String> failures = snapPartitionsMap.entrySet().stream()
29017+
.filter(entry -> entry.getValue().size() != partitionCount)
29018+
.map(Map.Entry::getKey)
29019+
.toList();
29020+
failedSnapshots.addAll(failures);
29021+
return failedSnapshots.toArray(new String[failedSnapshots.size()]);
29022+
}
2898829023
/**
2898929024
* Return a Map<Integer, String[]> where the key is the member id
2899029025
* and the value is the list of stores that are known by all members

prj/coherence-core/src/main/java/com/tangosol/persistence/AbstractPersistenceManager.java

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import com.tangosol.util.Base;
4040
import com.tangosol.util.ClassHelper;
4141
import com.tangosol.util.NullImplementation;
42+
import com.tangosol.util.SimpleMapEntry;
4243

4344
import java.io.DataOutput;
4445
import java.io.EOFException;
@@ -49,7 +50,10 @@
4950
import java.nio.channels.FileLock;
5051

5152
import java.util.ArrayList;
53+
import java.util.Arrays;
54+
import java.util.Deque;
5255
import java.util.HashSet;
56+
import java.util.LinkedList;
5357
import java.util.List;
5458
import java.util.Map;
5559
import java.util.Properties;
@@ -576,24 +580,37 @@ public PersistenceTools getPersistenceTools()
576580
throw new IllegalArgumentException("snapshot must have at least one GUID");
577581
}
578582

579-
String sGUID = asGUIDs[0];
580-
int nVersion;
581-
int cPartitions;
583+
int nVersion;
584+
int cPartitions;
585+
FileLock fileLock = null;
586+
String sGUID = null;
582587

583588
PersistentStore<ReadBuffer> store = null;
584589

585590
try
586591
{
592+
Map.Entry<String, FileLock> entry = pickStore(asGUIDs);
593+
fileLock = entry.getValue();
594+
sGUID = entry.getKey();
595+
587596
store = open(sGUID, null);
588597
cPartitions = CachePersistenceHelper.getPartitionCount(store);
589598
nVersion = CachePersistenceHelper.getPersistenceVersion(store);
590599
}
600+
catch (Exception e)
601+
{
602+
throw Base.ensureRuntimeException(e);
603+
}
591604
finally
592605
{
593606
if (store != null)
594607
{
595608
close(sGUID);
596609
}
610+
if (fileLock != null)
611+
{
612+
FileHelper.unlockFile(fileLock);
613+
}
597614
}
598615

599616
OfflinePersistenceInfo info = new OfflinePersistenceInfo(cPartitions, getStorageFormat(),
@@ -602,6 +619,23 @@ public PersistenceTools getPersistenceTools()
602619
return instantiatePersistenceTools(info);
603620
}
604621

622+
private Map.Entry<String, FileLock> pickStore(String[] stores)
623+
{
624+
Deque<String> candidates = new LinkedList<>(Arrays.asList(stores));
625+
String guid;
626+
while((guid = candidates.poll()) != null)
627+
{
628+
File lockDir = getLockDirectory();
629+
File lockFile = new File(lockDir, guid + ".store.lck");
630+
FileLock fileLock = FileHelper.lockFile(lockFile);
631+
if (fileLock != null)
632+
{
633+
return new SimpleMapEntry<>(guid, fileLock);
634+
}
635+
}
636+
throw new RuntimeException("all stores are locked");
637+
}
638+
605639
/**
606640
* {@inheritDoc}
607641
*/

prj/coherence-core/src/main/java/com/tangosol/persistence/CachePersistenceHelper.java

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@
7575
import java.io.Reader;
7676
import java.io.Writer;
7777

78+
import java.lang.reflect.InvocationTargetException;
79+
import java.lang.reflect.Method;
80+
7881
import java.util.ArrayList;
7982
import java.util.List;
8083
import java.util.Map;
@@ -1205,6 +1208,22 @@ public static void resumeService(final Cluster cluster, final String sServiceNam
12051208
* @throws PersistenceException if any errors
12061209
*/
12071210
public static PersistenceTools getSnapshotPersistenceTools(File dirSnapshot)
1211+
{
1212+
return getSnapshotPersistenceTools(dirSnapshot, false); // delegate to the two-argument overload with snapshot validation disabled
1213+
}
1214+
1215+
/**
1216+
* Return an implementation specific instance of {@link PersistenceTools} for
1217+
* the given local snapshot directory.
1218+
*
1219+
* @param dirSnapshot the snapshot directory to get tools for
1220+
* @param validation whether to enable snapshot validation
1221+
*
1222+
* @return an implementation specific instance of PersistenceTools
1223+
*
1224+
* @throws PersistenceException if any errors
1225+
*/
1226+
public static PersistenceTools getSnapshotPersistenceTools(File dirSnapshot, boolean validation)
12081227
{
12091228
PersistenceTools tools;
12101229

@@ -1250,7 +1269,7 @@ public static PersistenceTools getSnapshotPersistenceTools(File dirSnapshot)
12501269

12511270
if ("BDB".equals(sPersistenceType))
12521271
{
1253-
tools = new BerkeleyDBManager(dirSnapshot, null, null).getPersistenceTools();
1272+
tools = new BerkeleyDBManager(dirSnapshot, null, null, validation).getPersistenceTools();
12541273
}
12551274
else
12561275
{
@@ -1721,7 +1740,26 @@ public static String[] getFailedSnapshots(PersistenceEnvironment env)
17211740
File dirSnapshot = new File(dirSnapshots, sName);
17221741
try
17231742
{
1724-
getSnapshotPersistenceTools(dirSnapshot).validate();
1743+
PersistenceTools pt = getSnapshotPersistenceTools(dirSnapshot, true);
1744+
try
1745+
{
1746+
// Using reflection because the PersistenceTools interface wasn't updated,
1747+
// and BerkeleyDBManager returns an anonymous class as the PersistenceTools
1748+
// implementation, so casting isn't possible.
1749+
Method validate = pt.getClass().getMethod("validateWithPartitions");
1750+
validate.setAccessible(true);
1751+
String[] checkedPartitions = (String[]) validate.invoke(pt);
1752+
String validationResult = String.format("%s~~~[%s]", sName, String.join(",", checkedPartitions));
1753+
asFailedSnapshots.add(validationResult);
1754+
}
1755+
catch (NoSuchMethodException e)
1756+
{
1757+
getSnapshotPersistenceTools(dirSnapshot, true).validate();
1758+
}
1759+
catch (InvocationTargetException | IllegalAccessException e)
1760+
{
1761+
throw Base.ensureRuntimeException(e);
1762+
}
17251763
}
17261764
catch (RuntimeException e)
17271765
{

0 commit comments

Comments
 (0)