Commit 45e53fd0 authored by Michael Ritter

#50 Support for introspection into Token Ingestion

parent cf47ae01
......@@ -5,18 +5,11 @@
package edu.umiacs.ace.monitor.register;
import edu.umiacs.ace.monitor.settings.SettingsConstants;
import edu.umiacs.ace.monitor.settings.SettingsParameter;
import edu.umiacs.ace.monitor.settings.SettingsUtil;
import edu.umiacs.ace.util.PersistUtil;
import org.apache.log4j.NDC;
import javax.persistence.EntityManager;
import javax.servlet.ServletContextEvent;
import javax.servlet.ServletContextListener;
import static edu.umiacs.ace.monitor.settings.SettingsConstants.PARAM_INGEST;
/**
*
* @author shake
......@@ -25,10 +18,6 @@ public class IngestContextListener implements ServletContextListener {
/**
 * Webapp-startup hook: reads the configured maximum ingest thread count
 * from the settings store and applies it to the ingest thread pool.
 *
 * @param sce servlet context event supplied by the container (unused)
 */
public void contextInitialized(ServletContextEvent sce) {
NDC.push("[Ingest startup]");
try {
EntityManager em = PersistUtil.getEntityManager();
SettingsParameter ingestSettings = SettingsUtil.getOrDefault(PARAM_INGEST,
SettingsConstants.maxIngestThreads, em);
// maxIngestThreads is stored as a String setting; parse before applying.
// NOTE(review): NumberFormatException here will abort context startup —
// presumably the setting is validated elsewhere; confirm.
IngestThreadPool.setMaxThreads(Integer.parseInt(ingestSettings.getValue()));
} finally {
// Always pop the logging context, even if the settings lookup or parse throws,
// so the NDC stack does not leak into subsequent container threads.
NDC.pop();
}
}
......
......@@ -34,18 +34,17 @@ import edu.umiacs.ace.monitor.core.Collection;
import edu.umiacs.ace.monitor.core.MonitoredItem;
import edu.umiacs.ace.util.PersistUtil;
import edu.umiacs.util.Strings;
import org.apache.log4j.Logger;
import javax.persistence.EntityManager;
import javax.persistence.EntityTransaction;
import javax.persistence.NoResultException;
import javax.persistence.Query;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.RecursiveAction;
import javax.persistence.EntityManager;
import javax.persistence.EntityTransaction;
import javax.persistence.NoResultException;
import javax.persistence.Query;
/**
* Class to register directories from a token store
......@@ -53,14 +52,13 @@ import javax.persistence.Query;
* @author shake
*/
public class IngestDirectory extends RecursiveAction {
private static final Logger LOG = Logger.getLogger(IngestDirectory.class);
private Collection coll;
private Set<String> identifiers;
private Set<String> existingParents = new HashSet<String>();
private Set<String> existingParents = new HashSet<>();
private EntityManager em = PersistUtil.getEntityManager();
private int numTransactions = 0;
public IngestDirectory(Set<String> identifiers , Collection coll){
public IngestDirectory(Set<String> identifiers, Collection coll) {
this.identifiers = identifiers;
this.coll = coll;
}
......@@ -68,13 +66,13 @@ public class IngestDirectory extends RecursiveAction {
@Override
protected void compute() {
// We want this to remain single threaded, so we just leave it be
if ( identifiers == null || coll == null ) {
if (identifiers == null || coll == null) {
return;
}
EntityTransaction trans = em.getTransaction();
trans.begin();
for ( String identifier : identifiers ) {
for (String identifier : identifiers) {
extractAndRegisterParentDirs(identifier);
}
trans.commit();
......@@ -82,31 +80,30 @@ public class IngestDirectory extends RecursiveAction {
private void extractAndRegisterParentDirs(String path) {
// We don't have a FileBean, so build the pathList ourselves
int index;
List<String> pathList = new LinkedList<>();
StringBuilder fullPath = new StringBuilder(path);
List <String> pathList = new LinkedList<String>();
if ( fullPath.charAt(0) != '/' ) {
if (fullPath.charAt(0) != '/') {
fullPath.insert(0, "/");
}
int index = 0;
while( (index = fullPath.lastIndexOf("/")) != 0 ) {
while ((index = fullPath.lastIndexOf("/")) != 0) {
pathList.add(fullPath.toString());
fullPath.delete(index, fullPath.length());
}
pathList.add(fullPath.toString());
// Same as AuditThread, but with our pathList
String parentName = (pathList.size() > 1
? pathList.get(1) : null);
String parentName = pathList.size() > 1 ? pathList.get(1) : null;
// 1. make sure directory path is registered
if (parentName != null) {
//parentName = Strings.cleanStringForXml(parentName, '_');
for ( int i = 1; i < pathList.size(); i++) {
String parent = (pathList.size() > i + 1 ? pathList.get(i+1) : null);
for (int i = 1; i < pathList.size(); i++) {
String parent = (pathList.size() > i + 1 ? pathList.get(i + 1) : null);
parent = Strings.cleanStringForXml(parent, '_');
createDirectory(pathList.get(i), parent);
if ( numTransactions % 30 == 0 ) {
if (numTransactions % 30 == 0) {
em.flush();
em.clear();
}
......@@ -116,10 +113,10 @@ public class IngestDirectory extends RecursiveAction {
private void createDirectory(String directory, String root) {
MonitoredItem mi;
if ( existingParents.contains(directory) || directory == null ) {
if (existingParents.contains(directory) || directory == null) {
return;
}
if ( (mi = getItemByPath(directory)) != null ) {
if ((mi = getItemByPath(directory)) != null) {
Date d = new Date();
mi.setLastSeen(d);
mi.setLastVisited(d);
......@@ -133,13 +130,13 @@ public class IngestDirectory extends RecursiveAction {
}
public MonitoredItem getItemByPath( String path ) {
public MonitoredItem getItemByPath(String path) {
Query q = em.createNamedQuery("MonitoredItem.getItemByPath");
q.setParameter("path", path);
q.setParameter("coll", coll);
try {
return (MonitoredItem) q.getSingleResult();
} catch ( NoResultException ex ) {
} catch (NoResultException ex) {
return null;
}
......@@ -161,9 +158,9 @@ public class IngestDirectory extends RecursiveAction {
mi.setPath(path);
mi.setState(initialState);
mi.setSize(size);
em.persist(mi);
numTransactions++;
return mi;
}
......
......@@ -5,57 +5,97 @@ import edu.umiacs.ace.monitor.core.Token;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
/**
* A private class to supervise token ingestion. We use it to keep track of
* what collections we have seen
*
* @author shake
*/
public class IngestSupervisor implements Runnable {
private static final Logger LOG = Logger.getLogger(IngestSupervisor.class);
// NOTE(review): only referenced from the commented-out block in run();
// appears superseded by the per-state tracking in 'states' — candidate for removal
private static Map<Collection, Set<String>> hasSeen = new HashMap<>();
private final Map<String, Token> tokens;
private final Collection coll;
private final ForkJoinPool pool;
// I wonder if we should combine these and have some type of work unit to encapsulate
// Token, IngestState
// NOTE(review): duplicate declaration of 'tokens' (also declared above) — this
// looks like an old/new diff line captured twice; only one declaration can compile
private final Map<String, Token> tokens;
// Per-state sets of token identifiers, exposed for JSP introspection below
private ConcurrentMap<IngestState, ConcurrentSkipListSet<String>> states;
/**
 * @param tokens map of identifier -> Token to ingest into the collection
 * @param coll   the collection the tokens belong to
 */
public IngestSupervisor(final Map<String, Token> tokens, final Collection coll) {
this.tokens = tokens;
this.coll = coll;
this.pool = new ForkJoinPool();
this.states = new ConcurrentHashMap<>();
// so we don't have to worry about npes
states.put(IngestState.NEW, new ConcurrentSkipListSet<>());
states.put(IngestState.MATCH, new ConcurrentSkipListSet<>());
states.put(IngestState.QUEUED, new ConcurrentSkipListSet<>());
states.put(IngestState.UPDATED, new ConcurrentSkipListSet<>());
}
// Submits a directory-registration task and a token-ingest task to the
// fork/join pool, waits for both, then shuts the pool down.
public void run() {
LOG.info("Starting Supervisor");
// NOTE(review): this submit is repeated below after the QUEUED bookkeeping —
// old/new diff lines captured twice; only one submission should remain
ForkJoinTask dirTask = pool.submit(new IngestDirectory(tokens.keySet(), coll));
// Remove any tokens we've already seen and can possibly be in progress
// Possibly release tokens after the thread has finished merging them
/*
Set<String> tokensSeen = hasSeen.get(coll);
if (tokensSeen == null) {
tokensSeen = new HashSet<>();
tokensSeen.addAll(tokens.keySet());
} else {
tokens.keySet().removeAll(hasSeen.get(coll));
tokensSeen.addAll(tokens.keySet());
}
hasSeen.put(coll, tokensSeen);
*/
// Mark every incoming token as QUEUED before work begins
ConcurrentSkipListSet<String> queued = states.get(IngestState.QUEUED);
queued.addAll(tokens.keySet());
ForkJoinTask dirTask = pool.submit(new IngestDirectory(tokens.keySet(), coll));
// Split the token store we're given up equally among our threads
// and submit jobs to the thread pool
List<String> keyList = new ArrayList<>(tokens.keySet());
// NOTE(review): two fileTask lines below are the old and new diff versions of
// the same statement; the 4-arg form (with 'states') is the intended one
ForkJoinTask fileTask = pool.submit(new IngestThread(tokens, coll, keyList));
ForkJoinTask fileTask = pool.submit(new IngestThread(tokens, coll, keyList, states));
dirTask.quietlyJoin();
fileTask.quietlyJoin();
pool.shutdown();
LOG.info("Leaving Supervisor");
}
// Full state map: IngestState -> set of token identifiers in that state
public ConcurrentMap<IngestState, ConcurrentSkipListSet<String>> getState() {
return states;
}
// jsp helpers
public int getQueuedSize() {
return states.get(IngestState.QUEUED).size();
}
public Set<String> getQueued() {
return states.get(IngestState.QUEUED);
}
public int getNewSize() {
return states.get(IngestState.NEW).size();
}
public Set<String> getNewItems() {
return states.get(IngestState.NEW);
}
public int getUpdatedSize() {
return states.get(IngestState.UPDATED).size();
}
public Set<String> getUpdated() {
return states.get(IngestState.UPDATED);
}
public int getMatchSize() {
return states.get(IngestState.MATCH).size();
}
public Set<String> getMatched() {
return states.get(IngestState.MATCH);
}
}
......@@ -30,18 +30,21 @@
package edu.umiacs.ace.monitor.register;
import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import edu.umiacs.ace.monitor.core.Collection;
import edu.umiacs.ace.monitor.core.Token;
import edu.umiacs.ace.monitor.settings.SettingsConstants;
import edu.umiacs.ace.token.TokenStoreEntry;
import edu.umiacs.ace.token.TokenStoreReader;
import edu.umiacs.ace.util.CollectionThreadPoolExecutor;
import edu.umiacs.ace.util.KSFuture;
import edu.umiacs.ace.util.TokenUtil;
import org.apache.log4j.Logger;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.TimeUnit;
/**
* TODO - Possibly make a LinkedBlockingQueue that threads can poll from
......@@ -56,9 +59,7 @@ public class IngestThreadPool {
private static final IngestThreadPool instance = new IngestThreadPool();
private static final Logger LOG = Logger.getLogger(IngestThreadPool.class);
private static Map<Collection, Set<String>> hasSeen;
private static int maxThreads =
Integer.parseInt(SettingsConstants.maxIngestThreads);
private static Cache<Collection, KSFuture<IngestSupervisor>> cache;
private IngestThreadPool() {
}
......@@ -67,8 +68,11 @@ public class IngestThreadPool {
// We instantiate here to ensure 2 things:
// 1 - We use the correct number for max threads from the DB
// 2 - Java will throw an exception otherwise
if (hasSeen == null) {
hasSeen = new HashMap<>();
if (cache == null) {
// WeakReference instead?
cache = Caffeine.newBuilder()
.expireAfterAccess(10, TimeUnit.MINUTES)
.build();
}
return instance;
......@@ -85,7 +89,7 @@ public class IngestThreadPool {
throw new RuntimeException("Token file is corrupt");
}
IngestThreadPool thePool = IngestThreadPool.getInstance();
HashMap<String, Token> batchTokens = new HashMap<String, Token>();
HashMap<String, Token> batchTokens = new HashMap<>();
while (tokenReader.hasNext()) {
TokenStoreEntry tokenEntry = tokenReader.next();
......@@ -113,35 +117,22 @@ public class IngestThreadPool {
// We may want to save the Future so that we can display information
// about the current ingestion
executor.submitIngestThread(coll, new IngestSupervisor(tokens, coll));
KSFuture<IngestSupervisor> future =
executor.submitIngestThread(coll, new IngestSupervisor(tokens, coll));
cache.put(coll, future);
}
// TODO: Figure out how to display this... probably will move to the status display like audits
public String getStatus() {
return String.format("[Thread Pool] Active"); /*: %d, Completed: %d, Total: %d",
executor.getActiveCount(),
executor.getCompletedTaskCount(),
executor.getTaskCount());*/
}
// Not entirely accurate for the jsp, but it'll show what collections are
// ingesting what
public Map<Collection, Set<String>> getIngestedItems() {
return hasSeen;
public ConcurrentMap<Collection, KSFuture<IngestSupervisor>> getCache() {
return cache.asMap();
}
public static void setMaxThreads(int maxThreads) {
IngestThreadPool.maxThreads = maxThreads;
/**
 * Human-readable pool status for display; per-task counts are not
 * tracked here, so a fixed label is returned.
 */
public String getStatus() {
final String label = "[Thread Pool] Active";
return label;
}
// Invalidates every cached ingest supervisor future at shutdown.
// NOTE(review): will NPE if called before getInstance() has initialized 'cache' —
// presumably shutdown only runs after startup; confirm.
protected static void shutdownPools() {
LOG.debug("[Ingest] Shutting down thread pools.");
/*
executor.shutdown();
if (!executor.isTerminated()) {
executor.shutdownNow();
}
*/
cache.invalidateAll();
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment