Skip to content

Commit

Permalink
fix: Repair invalid ModelRecord lastUsed values in registry
Browse files Browse the repository at this point in the history
Motivation

Some model records were observed in the registry with an invalid lastUsed time of Long.MAX_VALUE, effectively pinning them at the front of the cache and causing them to remain loaded even if not used.

An attempt to fix this was made in #36, but this didn't end up repairing things in the affected environments since the invalid lastUsed value was still in the local cache(s) and ended up being synchronized back to the registry.

Modifications

- Add some logic to the janitor processing to detect and update cache entries with a Long.MAX_VALUE lastUsed value
   - This requires a new forceSetLastUsedTime() method in ConcurrentLinkedHashMap since it's otherwise not possible to decrease the current value
- Never sync this value back to the registry
- Avoid creating failure placeholder records with a Long.MAX_VALUE lastUsed value

Result

Avoid "stuck" models due to incorrect lastUsed time. We still haven't determined the root cause (which might already be fixed) but will be able to see whether the issue arises again once this fix is deployed.

Signed-off-by: Nick Hill <[email protected]>
  • Loading branch information
njhill committed Jun 28, 2022
1 parent 4dbc34b commit 95211ca
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 19 deletions.
53 changes: 35 additions & 18 deletions src/main/java/com/ibm/watson/modelmesh/ModelMesh.java
Original file line number Diff line number Diff line change
Expand Up @@ -3633,8 +3633,8 @@ protected Object invokeModel(final String modelId, final Method method, final Me
KVSTORE_LOAD_FAILURE, e);
CacheEntry<?> failedEntry = new CacheEntry<>(modelId, mr, mle);
cacheEntry = unloadManager != null
? unloadManager.insertFailedPlaceholderEntry(modelId, failedEntry, Long.MAX_VALUE)
: runtimeCache.putIfAbsent(modelId, failedEntry, Long.MAX_VALUE);
? unloadManager.insertFailedPlaceholderEntry(modelId, failedEntry, mr.getLastUsed())
: runtimeCache.putIfAbsent(modelId, failedEntry, mr.getLastUsed());
if (cacheEntry == null) {
throw mle;
}
Expand Down Expand Up @@ -5760,10 +5760,23 @@ public void run() {
+ " lastUsed is " + lastUsed + " > " + lastLastUsed);
}
lastLastUsed = lastUsed;

ModelRecord mr = registry.get(modelId);

if (lastUsed == Long.MAX_VALUE) {
// Cache entries here should not have a Long.MAX_VALUE lastUsed value but this has been observed
// in some environments. Repair/log here so that the entry is not pinned in the cache.
if (runtimeCache.forceSetLastUsedTime(modelId, now - (LASTUSED_AGE_ON_ADD_MS * 3L))) {
logger.warn("Force-updated unexpected Long.MAX_VALUE lastUsed time in cache for model " + modelId);
// Also update in registry if needed
repairLastUsedTimeIfNeeded(modelId, mr);
}
return;
}

// skip if new or recently used
final boolean newOrUsedRecently = now - lastUsed < LOCAL_JANITOR_FREQ_SECS * 2000L
+ loadTimeoutMs;
ModelRecord mr = registry.get(modelId);
if (newOrUsedRecently) {
if (mr != null) {
updateLastUsedTimeInRegistryIfStale(modelId, mr, lastUsed);
Expand Down Expand Up @@ -5979,6 +5992,10 @@ public void run() {
// ModelRecord instance should not be used after calling this meth (might be stale)
private void updateLastUsedTimeInRegistryIfStale(String modelId, ModelRecord mr, long lastUsed)
throws Exception {
if (lastUsed == Long.MAX_VALUE) {
// This can be the case for certain failure placeholder records
return;
}
// only attempt update of model record lastUsed time if it's more than
// minStaleAge out of date.
long recLastUsed = mr.getLastUsed();
Expand Down Expand Up @@ -6626,21 +6643,6 @@ void triggerProactiveLoadsForInstanceSubset(ClusterStats stats,
}
}

private void repairLastUsedTimeIfNeeded(String modelId, ModelRecord mr) throws Exception {
// Some ModelRecords records were observed in the registry with an invalid lastUsed
// time of Long.MAX_VALUE, effectively pinning them at the front of the cache and
// causing them to remain loaded even if not used.
// It's not clear whether the bug causing this has yet been fixed - for now we repair
// the entries and can monitor logs for the warning message below.
if (mr.getLastUsed() == Long.MAX_VALUE) {
mr.setLastUsed(currentTimeMillis() - (LASTUSED_AGE_ON_ADD_MS * 3L));
if (registry.conditionalSet(modelId, mr)) {
logger.warn("Repaired Long.MAX_VALUE lastUsed time in registry for model " + modelId);
}
// Don't worry if this fails, will try again next time around
}
}

/**
* @return number of missing instances which were removed from the map
*/
Expand Down Expand Up @@ -6729,6 +6731,21 @@ private void cleanLeaselessEtcdInstanceRecords() {
}
}

final void repairLastUsedTimeIfNeeded(String modelId, ModelRecord mr) throws Exception {
// Some ModelRecords records were observed in the registry with an invalid lastUsed
// time of Long.MAX_VALUE, effectively pinning them at the front of the cache and
// causing them to remain loaded even if not used.
// It's not clear whether the bug causing this has yet been fixed - for now we repair
// the entries and can monitor logs for the warning message below.
if (mr != null && mr.getLastUsed() == Long.MAX_VALUE) {
mr.setLastUsed(currentTimeMillis() - (LASTUSED_AGE_ON_ADD_MS * 3L));
if (registry.conditionalSet(modelId, mr)) {
logger.warn("Repaired Long.MAX_VALUE lastUsed time in registry for model " + modelId);
}
// Don't worry if this fails, will try again next time around
}
}

private void logModelCountMetrics() {
if (!metrics.isEnabled()) {
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,23 @@ public long getLastUsedTime(Object key) {
return lut <= 0L ? -1L : lut;
}

/**
* Not intended for general use
*/
public boolean forceSetLastUsedTime(Object key, long lastUsed) {
final Node<K, V> node = data.get(key);
if (node == null) {
return false;
}
evictionLock.lock();
try {
node.lastUsed = lastUsed;
} finally {
evictionLock.unlock();
}
return true;
}

/**
* @return entry's weight or -1 if not found
*/
Expand Down Expand Up @@ -1225,7 +1242,7 @@ Map<K, V> orderedMap(boolean ascending, int limit, long usedSinceOrBefore) {
: evictionDeque.descendingIterator();
while (iterator.hasNext() && (limit > map.size())) {
Node<K, V> node = iterator.next();
long lastUsed = node.lastUsed;
long lastUsed = node.getLastUsed();
if (lastUsed > 0L && (ascending ? lastUsed > usedSinceOrBefore
: lastUsed < usedSinceOrBefore)) {
break;
Expand Down

0 comments on commit 95211ca

Please sign in to comment.