package org.semanticdesktop.aperture.extractor.plaintext;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.PushbackReader;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import org.ontoware.rdf2go.model.node.URI;
import org.ontoware.rdf2go.vocabulary.RDF;
import org.semanticdesktop.aperture.extractor.Extractor;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.semanticdesktop.aperture.rdf.RDFContainer;
import org.semanticdesktop.aperture.util.IOUtil;
import org.semanticdesktop.aperture.util.UtfUtil;
import org.semanticdesktop.aperture.vocabulary.NFO;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/aperture-1.2.0.jar:org/semanticdesktop/aperture/extractor/plaintext/PlainTextExtractor.class */
/**
 * {@link Extractor} implementation that reads a stream as character data and,
 * when its leading characters look like text, stores the full text in the
 * supplied {@link RDFContainer}.
 */
public class PlainTextExtractor implements Extractor {
    /**
     * Number of leading characters requested for the plain-text check; also the
     * push-back capacity of the reader so that the sample can be returned to it.
     */
    private static final int STRING_TEST_LENGTH = 256;

    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Extracts the textual content of {@code inputStream} into {@code rDFContainer}.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>Up to four leading bytes are inspected for a byte order mark. If
     *       {@code UtfUtil} reports one, those BOM bytes are dropped and, when a
     *       supported charset name is associated with it, that charset replaces
     *       {@code charset}.</li>
     *   <li>A leading sample of the decoded text is checked character by
     *       character. If any character is undefined, or is an ISO control
     *       character that is not whitespace, a warning is logged and nothing is
     *       added to the container.</li>
     *   <li>Otherwise the whole text is read and, if non-empty, the container
     *       receives {@code RDF.type = NFO.PlainTextDocument} and
     *       {@code NIE.plainTextContent}.</li>
     * </ol>
     *
     * @param uri          not used by this implementation
     * @param inputStream  stream to read the document from; it is not closed here
     * @param charset      charset used for decoding, may be {@code null}, in which
     *                     case the platform default of {@link InputStreamReader} applies
     *                     unless a BOM determines the charset
     * @param str          not used by this implementation
     * @param rDFContainer container receiving the extracted statements
     * @throws ExtractorException wrapping any {@link IOException} raised while reading
     */
    @Override
    public void extract(URI uri, InputStream inputStream, Charset charset, String str, RDFContainer rDFContainer) throws ExtractorException {
        try {
            // Peek at the first bytes so a BOM can be detected and stripped.
            PushbackInputStream rawInput = new PushbackInputStream(inputStream, 4);
            byte[] head = IOUtil.readBytes(rawInput, 4);
            byte[] bom = UtfUtil.findMatchingBOM(head);

            Charset effectiveCharset = charset;
            if (bom != null) {
                // Return only the bytes that follow the BOM.
                rawInput.unread(head, bom.length, head.length - bom.length);
                String bomCharsetName = UtfUtil.getCharsetName(bom);
                if (bomCharsetName != null) {
                    try {
                        effectiveCharset = Charset.forName(bomCharsetName);
                    } catch (UnsupportedCharsetException unsupported) {
                        this.logger.info("Unsupported charset, trying to continue with current charset", (Throwable) unsupported);
                    }
                }
            } else {
                // No BOM: hand every peeked byte back to the stream.
                rawInput.unread(head);
            }

            InputStreamReader decoder;
            if (effectiveCharset != null) {
                decoder = new InputStreamReader(rawInput, effectiveCharset);
            } else {
                decoder = new InputStreamReader(rawInput);
            }
            PushbackReader textReader = new PushbackReader(decoder, STRING_TEST_LENGTH);

            // Sample the start of the document and bail out if it does not look like text.
            String sample = IOUtil.readString(textReader, STRING_TEST_LENGTH);
            for (int pos = 0, end = sample.length(); pos < end; pos++) {
                if (!isPlainTextChar(sample.charAt(pos))) {
                    this.logger.warn("Document does not contain plain text");
                    return;
                }
            }

            // Put the sample back so the full read starts at the beginning of the text.
            textReader.unread(sample.toCharArray());
            String fullText = IOUtil.readString(textReader);
            if (fullText.length() > 0) {
                rDFContainer.add(RDF.type, NFO.PlainTextDocument);
                rDFContainer.add(NIE.plainTextContent, fullText);
            }
        } catch (IOException ioFailure) {
            throw new ExtractorException(ioFailure);
        }
    }

    /**
     * Tells whether a character is acceptable in a plain-text sample: it must be
     * defined, and if it is an ISO control character it must also be whitespace.
     */
    private static boolean isPlainTextChar(char candidate) {
        if (!Character.isDefined(candidate)) {
            return false;
        }
        return !Character.isISOControl(candidate) || Character.isWhitespace(candidate);
    }
}
