Skip to content

Commit

Permalink
Merge pull request #8 from fcrepo4-labs/legacy-fs-support
Browse files Browse the repository at this point in the history
Added support for direct migration from legacy-fs.
  • Loading branch information
mikedurbin committed May 8, 2015
2 parents 02b57af + 2963e81 commit 17ed883
Show file tree
Hide file tree
Showing 16 changed files with 484 additions and 41 deletions.
49 changes: 49 additions & 0 deletions src/main/java/org/fcrepo/migration/foxml11/AkubraFSIDResolver.java
@@ -0,0 +1,49 @@
package org.fcrepo.migration.foxml11;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;

/**
 * An extension of DirectoryScanningIDResolver for datastream directories of fedora
 * repositories using the akubra-fs storage implementation.
 *
 * @author mdurbin
 */
public class AkubraFSIDResolver extends DirectoryScanningIDResolver {

    /** Prefix every URL-decoded akubra-fs datastream filename must carry. */
    private static final String ID_PREFIX = "info:fedora/";

    /**
     * Basic constructor.
     * @param indexDir A directory that will serve as a lucene index directory to cache ID resolution.
     * @param dsRoot the root directory of the AkubraFS datastream store.
     * @throws IOException if the index cannot be created or the datastream store cannot be scanned
     */
    public AkubraFSIDResolver(final File indexDir, final File dsRoot) throws IOException {
        super(indexDir, dsRoot);
    }

    /**
     * Basic constructor; the superclass builds a temporary index when no index directory is given.
     * @param dsRoot the root directory of the AkubraFS datastream store.
     * @throws IOException if the index cannot be created or the datastream store cannot be scanned
     */
    public AkubraFSIDResolver(final File dsRoot) throws IOException {
        super(null, dsRoot);
    }

    /**
     * Derives the fedora 3 internal id from an akubra-fs datastream file: URL-decodes
     * the filename, strips the "info:fedora/" prefix and replaces '/' with '+'
     * (e.g. "info%3Afedora%2Fpid%2FDSID%2FDSID.0" becomes "pid+DSID+DSID.0").
     * @param f a file from the akubra-fs datastream store
     * @return the internal id encoded in the file's name
     * @throws IllegalArgumentException if the decoded name lacks the "info:fedora/" prefix
     */
    @Override
    protected String getInternalIdForFile(final File f) {
        String id = f.getName();
        try {
            // akubra-fs URL-encodes the internal id to form the filename; reverse that here.
            id = URLDecoder.decode(id, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is required of every JVM, so this branch is effectively unreachable.
            throw new RuntimeException(e);
        }
        if (!id.startsWith(ID_PREFIX)) {
            throw new IllegalArgumentException(f.getName()
                    + " does not appear to be a valid akubraFS datastream file!");
        }
        id = id.substring(ID_PREFIX.length());
        id = id.replace('/', '+');
        return id;
    }
}
@@ -1,11 +1,6 @@
package org.fcrepo.migration.foxml11;

import static org.slf4j.LoggerFactory.getLogger;

import java.io.File;
import java.io.IOException;
import java.net.URLDecoder;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
Expand All @@ -24,6 +19,11 @@
import org.apache.lucene.util.Version;
import org.slf4j.Logger;

import java.io.File;
import java.io.IOException;

import static org.slf4j.LoggerFactory.getLogger;

/**
* An InternalIDResolver implementation that generates an index of
* datastream ids (filenames) to file paths for the contents of a
Expand All @@ -33,19 +33,51 @@
* id for that datastream version.
* @author mdurbin
*/
public class DirectoryScanningIDResolver implements InternalIDResolver {
public abstract class DirectoryScanningIDResolver implements InternalIDResolver {

private static final Logger LOGGER = getLogger(InternalIDResolver.class);

/**
* A lucene IndexSearcher over an index maintained by this class.
* For every file found in the datastream directory a document exists
* in this index that contains an "id" field and a "path" field. The
* id field is the internal id, the path field is the full path to the
* file containing that datastream content.
*/
private IndexSearcher searcher;

/**
* directory scanning ID resolver
* @param indexDir the index directory
* @param cachedIndexDir the index directory. If it exists, the old cache will be used, if it doesn't a new
* cache will be built at that location. If it is null, a new cache will be built in
* the temp file space that will be deleted upon application shutdown.
* @param dsRoot the datastream root
* @throws IOException IO exception
*/
public DirectoryScanningIDResolver(final File indexDir, final File dsRoot) throws IOException {
public DirectoryScanningIDResolver(final File cachedIndexDir, final File dsRoot) throws IOException {
final File indexDir;
if (cachedIndexDir == null) {
final File temp = File.createTempFile("tempfile", "basedir");
temp.delete();
temp.mkdir();
indexDir = new File(temp, "index");
LOGGER.info("No index directory specified. Creating temporary index at \""
+ indexDir.getAbsolutePath() + "\".");
Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
@Override
public void run() {
try {
LOGGER.info("Deleting index directory at \"" + indexDir.getAbsolutePath() + "\"...");
FileUtils.deleteDirectory(indexDir);
} catch (IOException e) {
LOGGER.error("Unable to delete index directory at \"" + indexDir.getAbsolutePath() + "\"!", e);
e.printStackTrace();
}
}
}));
} else {
indexDir = cachedIndexDir;
}
final Directory dir = FSDirectory.open(indexDir);
if (indexDir.exists()) {
LOGGER.warn("Index exists at \"" + indexDir.getPath() + "\" and will be used. "
Expand All @@ -69,8 +101,7 @@ public DirectoryScanningIDResolver(final File indexDir, final File dsRoot) throw
@Override
public CachedContent resolveInternalID(final String id) {
try {
final TopDocs result = searcher.search(new TermQuery(new Term("file", "info:fedora/"
+ id.replace('+', '/'))), 2);
final TopDocs result = searcher.search(new TermQuery(new Term("id", id)), 2);
if (result.totalHits == 1) {
return new FileCachedContent(new File(searcher.doc(result.scoreDocs[0].doc).get("path")));
} else if (result.totalHits < 1) {
Expand All @@ -93,9 +124,15 @@ private void indexDatastreams(final IndexWriter writer, final File f) throws IOE
} else {
final Document doc = new Document();
doc.add(new StringField("path", f.getPath(), Field.Store.YES));
doc.add(new StringField("file", URLDecoder.decode(f.getName(), "UTF-8"), Field.Store.NO));
doc.add(new StringField("id", getInternalIdForFile(f), Field.Store.YES));
LOGGER.trace("Added \"" + getInternalIdForFile(f) + "\"");
writer.addDocument(doc);
}
}

/**
 * Determines the internal id for the given file.
 * @param f a file found beneath the datastream root whose name encodes a
 *          fedora 3 internal id (the encoding is storage-implementation specific)
 * @return the internal id (e.g. "example:1+DS2+DS2.0") encoded by the file's name
 */
protected abstract String getInternalIdForFile(File f);

}
Expand Up @@ -8,9 +8,9 @@
public interface InternalIDResolver {

    /**
     * Gets the datastream for an internal ID.
     * @param id the internal id referenced within a FOXML file.
     * @return the binary content for the datastream referenced by the internal id
     */
    // NOTE: the "public" modifier is redundant on interface members and omitted here.
    CachedContent resolveInternalID(String id);
}
35 changes: 35 additions & 0 deletions src/main/java/org/fcrepo/migration/foxml11/LegacyFSIDResolver.java
@@ -0,0 +1,35 @@
package org.fcrepo.migration.foxml11;

import java.io.File;
import java.io.IOException;

/**
 * An extension of DirectoryScanningIDResolver for datastream directories of fedora
 * repositories using the legacy-fs storage implementation.
 *
 * @author mdurbin
 */
public class LegacyFSIDResolver extends DirectoryScanningIDResolver {

    /**
     * Basic constructor.
     * @param indexDir A directory that will serve as a lucene index directory to cache ID resolution.
     * @param dsRoot the root directory of the legacy-fs datastream store.
     * @throws IOException if the index cannot be created or the datastream store cannot be scanned
     */
    public LegacyFSIDResolver(final File indexDir, final File dsRoot) throws IOException {
        super(indexDir, dsRoot);
    }

    /**
     * Basic constructor; the superclass builds a temporary index when no index directory is given.
     * @param dsRoot the root directory of the legacy-fs datastream store.
     * @throws IOException if the index cannot be created or the datastream store cannot be scanned
     */
    public LegacyFSIDResolver(final File dsRoot) throws IOException {
        super(null, dsRoot);
    }

    /**
     * Derives the fedora 3 internal id from a legacy-fs datastream filename by
     * restoring the ":" that legacy-fs replaces with "_".
     * @param f a file from the legacy-fs datastream store
     * @return the internal id encoded in the file's name
     */
    @Override
    protected String getInternalIdForFile(final File f) {
        // Only the FIRST "_" is converted back to ":".
        // NOTE(review): this assumes the PID namespace itself contains no underscore;
        // confirm against repositories whose namespaces use "_".
        return f.getName().replaceFirst("_", ":");
    }
}
Expand Up @@ -487,7 +487,7 @@ protected void updateResourceProperties(final FedoraResource resource,
updateRequest.add(new UpdateDataInsert(triplesToInsert));
final ByteArrayOutputStream sparqlUpdate = new ByteArrayOutputStream();
updateRequest.output(new IndentedWriter(sparqlUpdate));
//LOGGER.debug("SPARQL: " + sparqlUpdate.toString("UTF-8"));
LOGGER.trace("SPARQL: " + sparqlUpdate.toString("UTF-8"));
resource.updateProperties(sparqlUpdate.toString("UTF-8"));
suffix = 0;
} catch (final FedoraException e) {
Expand Down
34 changes: 26 additions & 8 deletions src/main/resources/spring/migration-bean.xml
Expand Up @@ -13,7 +13,6 @@
migration scenario. (note, unless you reconfigure it, it will write
content to localhost:8080/rest)
<property name="handler" ref="objectAbstraction" />
-->
<!-- The following is a convenience option for testing that allows you to
point to a very large directory, but only ingest as many records as
Expand All @@ -28,7 +27,7 @@

<bean id="nativeFoxmlDirectoryObjectSource" class="org.fcrepo.migration.foxml11.NativeFoxmlDirectoryObjectSource" >
<constructor-arg name="objectStore" ref="objectStore" />
<constructor-arg name="resolver" ref="directoryScanningIDResolver" />
<constructor-arg name="resolver" ref="akubraIDResolver" />
<property name="fetcher" ref="httpClientURLFetcher"/>
</bean>

Expand Down Expand Up @@ -101,12 +100,31 @@
<constructor-arg name="repositoryURL" value="http://localhost:8080/rest/" />
</bean>

<!-- A utility bean that maintains an index necessary to resolve datastream files from the fedora 3 internal ids -->
<bean id="directoryScanningIDResolver" class="org.fcrepo.migration.foxml11.DirectoryScanningIDResolver">
<!-- A utility bean that maintains an index necessary to resolve datastream files from the fedora 3 internal ids
as encoded for fedora installations using the akubra-fs storage.
-->
<bean id="akubraIDResolver" class="org.fcrepo.migration.foxml11.AkubraFSIDResolver">
<constructor-arg name="dsRoot" type="java.io.File" ref="datastreamStore"/>
<constructor-arg name="indexDir" type="java.io.File" ref="indexRoot" />

<!-- Add the following line back in if you wish to maintain a cache of the internal id mapping between
runs of this application. This will save substantial time at startup, but is not appropriate if
the repository changes. -->
<!--<constructor-arg name="indexDir" type="java.io.File" ref="indexRoot" />-->
</bean>

<!-- A utility bean that maintains an index necessary to resolve datastream files from the fedora 3 internal ids
as encoded for fedora installations using the legacy-fs storage.
-->
<bean id="legacyIDResolver" class="org.fcrepo.migration.foxml11.LegacyFSIDResolver">
<constructor-arg name="dsRoot" type="java.io.File" ref="datastreamStore"/>

<!-- Add the following line back in if you wish to maintain a cache of the internal id mapping between
runs of this application. This will save substantial time at startup, but is not appropriate if
the repository changes. -->
<!--<constructor-arg name="indexDir" type="java.io.File" ref="indexRoot" />-->
</bean>


<!-- A bean that defines the codebase used to make HTTP requests to fetch content at URLs. -->
<bean id="httpClientURLFetcher" class="org.fcrepo.migration.foxml11.HttpClientURLFetcher" />

Expand All @@ -115,19 +133,19 @@
<!-- Local Environment Configuration -->

<bean id="objectStore" class="java.io.File">
<constructor-arg type="java.lang.String" value="src/test/resources/objectStore" />
<constructor-arg type="java.lang.String" value="src/test/resources/akubraFS/objectStore" />
</bean>

<bean id="datastreamStore" class="java.io.File">
<constructor-arg type="java.lang.String" value="src/test/resources/datastreamStore" />
<constructor-arg type="java.lang.String" value="src/test/resources/akubraFS/datastreamStore" />
</bean>

<bean id="exportedFoxmlDir" class="java.io.File">
<constructor-arg type="java.lang.String" value="src/test/resources/exported" />
</bean>

<bean id="indexRoot" class="java.io.File">
<constructor-arg type="java.lang.String" value="target/work/index" />
<constructor-arg type="java.lang.String" value="index" />
</bean>


Expand Down
@@ -0,0 +1,41 @@
package org.fcrepo.migration;

import org.junit.Before;
import org.springframework.context.ConfigurableApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import javax.xml.stream.XMLStreamException;
import java.io.IOException;

/**
 * Runs the full migration over the stored-legacy-foxml spring configuration and
 * exposes the resulting handler/fetcher to the shared assertions in Example1TestSuite.
 *
 * @author mdurbin
 */
public class LegacyFoxmlStorageMigratorTest extends Example1TestSuite {

    // Results are cached statically so the expensive spring-driven migration
    // runs only once for the whole suite.
    private static DummyHandler result;

    private static DummyURLFetcher fetcher;

    /**
     * Performs the migration on the first invocation only; later invocations
     * see the cached static results.
     * NOTE(review): instance-level "synchronized" does not guard static state
     * across distinct instances; adequate only under sequential test execution.
     * @throws XMLStreamException on FOXML parsing errors
     * @throws IOException on I/O errors during migration
     */
    @Before
    public synchronized void processFoxml() throws XMLStreamException, IOException {
        if (getResult() == null) {
            final ConfigurableApplicationContext context =
                    new ClassPathXmlApplicationContext("spring/stored-legacy-foxml.xml");
            // Assign the static fields directly (not via "this.") since they are class-level.
            result = (DummyHandler) context.getBean("dummyHandler");
            fetcher = (DummyURLFetcher) context.getBean("dummyFetcher");
            final Migrator m = (Migrator) context.getBean("migrator");
            m.run();
            context.close();
        }
    }

    @Override
    protected DummyHandler getResult() {
        return result;
    }

    @Override
    protected DummyURLFetcher getFetcher() {
        return fetcher;
    }
}
@@ -0,0 +1,46 @@
package org.fcrepo.migration.foxml11;

import junit.framework.Assert;
import org.apache.commons.io.FileUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import static org.junit.Assert.assertEquals;

/**
 * Tests the filename-to-internal-id mapping performed by AkubraFSIDResolver.
 *
 * @author mikedurbin
 */
public class AkubraFSIDResolverTest {

    private AkubraFSIDResolver idResolver;

    /** Scratch directory used as the (empty) datastream root for the resolver. */
    private File tempDir;

    @Before
    public void setup() throws IOException {
        tempDir = File.createTempFile("tempfile", "basedir");
        tempDir.delete();
        tempDir.mkdir();
        idResolver = new AkubraFSIDResolver(tempDir);
    }

    @Test
    public void testIDMapping() throws UnsupportedEncodingException {
        // The filename URL-decodes to "info:fedora/example:1/DS2/DS2.0"; the resolver
        // strips the prefix and replaces '/' with '+'.
        // Uses org.junit.Assert (static import) rather than the deprecated
        // JUnit 3 junit.framework.Assert.
        assertEquals("example:1+DS2+DS2.0",
                idResolver.getInternalIdForFile(new File("info%3Afedora%2Fexample%3A1%2FDS2%2FDS2.0")));
    }

    @Test (expected = IllegalArgumentException.class)
    public void testBadFileIDMapping() throws UnsupportedEncodingException {
        // Decoded name lacks the "info:fedora/" prefix and must be rejected.
        idResolver.getInternalIdForFile(new File("example%3A1%2FDS2%2FDS2.0"));
    }

    @After
    public void cleanup() throws IOException {
        FileUtils.deleteDirectory(tempDir);
    }
}

0 comments on commit 17ed883

Please sign in to comment.