Aperture Framework

Aperture is a framework that deals with multiple types of documents and formats. It is written in Java and is under heavy development. It tries to handle all of these formats through a single interface, which makes it quite useful for crawling resources in a standard way. From its website:

“Aperture is a Java framework for extracting and querying full-text content and metadata from various information systems (e.g. file systems, web sites, mail boxes) and the file formats (e.g. documents, images) occurring in these systems.”

The following class will let you extract the full text from your files, along with some additional metadata (such as the creator, the title and the language).

I found it quite useful and simple at the same time. Here is an example of how it works (the code is a modification of the extractor example):

import info.aduna.io.IOUtil;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Set;
import org.semanticdesktop.aperture.extractor.Extractor;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.semanticdesktop.aperture.extractor.ExtractorFactory;
import org.semanticdesktop.aperture.extractor.ExtractorRegistry;
import org.semanticdesktop.aperture.extractor.impl.DefaultExtractorRegistry;
import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier;
import org.semanticdesktop.aperture.mime.identifier.magic.MagicMimeTypeIdentifier;
import org.semanticdesktop.aperture.rdf.RDFContainer;
import org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl;
import org.semanticdesktop.aperture.vocabulary.DATA;

public class Normalizer {

	private RDFContainer container;

	public void normalize(String filename) throws IOException, ExtractorException  {
		// create a MimeTypeIdentifier
		MimeTypeIdentifier identifier = new MagicMimeTypeIdentifier();

		// create an ExtractorRegistry containing all Extractors
		ExtractorRegistry extractorRegistry = new DefaultExtractorRegistry();

		// create a stream of the specified file
		File file = new File(filename);
		FileInputStream stream = new FileInputStream(file);

		// read as many bytes of the file as desired by the MIME type identifier
		int minimumArrayLength = identifier.getMinArrayLength();
		int bufferSize = Math.max(minimumArrayLength, 8192);
		BufferedInputStream buffer = new BufferedInputStream(stream, bufferSize);
		buffer.mark(minimumArrayLength + 10); // add some for safety
		byte[] bytes = IOUtil.readBytes(buffer, minimumArrayLength);

		// let the MimeTypeIdentifier determine the MIME type of this file
		String mimeType = identifier.identify(bytes, file.getPath(), null);

		// skip the extraction phase when the MIME type could not be determined
		if (mimeType == null) {
			System.err.println(”WARNING: MIME type could not be established.”);
		} else {
			// create the RDFContainer that will hold the RDF model
			RDFContainerFactoryImpl containerFactory = new RDFContainerFactoryImpl();
			container = containerFactory.newInstance(file.toURI().toString());

			// determine and apply an Extractor that can handle this MIME type
			Set factories = extractorRegistry.get(mimeType);
			if (factories != null && !factories.isEmpty()) {
				// just fetch the first available Extractor
				ExtractorFactory factory = (ExtractorFactory) factories.iterator().next();
				Extractor extractor = factory.get();

				// apply the extractor on the specified file
				buffer.reset();
				extractor.extract(container.getDescribedUri(), buffer, null, mimeType, container);
			}

			// add the MIME type as an additional statement to the RDF model
			container.add(DATA.mimeType, mimeType);
		}
		buffer.close();
	}

	public String getCreator(){
		return container.getString(DATA.creator);
	}

	public String getTitle(){
		return container.getString(DATA.title);
	}

	public String getLanguge(){
		return container.getString(DATA.language);
	}

	public String getFullTetx(){
		return container.getString(DATA.fullText);
	}
}