Bad SMILES are marked with an empty molecule and property. This mirro…

…rs behaviour of the IteratingSMILESReader.
cdk · Aug 2, 2015 · 6ded684 · 6ded684
1 parent 8afd272
commit 6ded684
Show file tree

Hide file tree

Showing 2 changed files with 68 additions and 24 deletions.
diff --git a/storage/smiles/src/main/java/org/openscience/cdk/io/SMILESReader.java b/storage/smiles/src/main/java/org/openscience/cdk/io/SMILESReader.java
@@ -23,13 +23,7 @@
  */
 package org.openscience.cdk.io;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.StringReader;
-
+import org.openscience.cdk.CDKConstants;
 import org.openscience.cdk.exception.CDKException;
 import org.openscience.cdk.interfaces.IAtomContainer;
 import org.openscience.cdk.interfaces.IAtomContainerSet;
@@ -39,18 +33,30 @@
 import org.openscience.cdk.interfaces.IChemSequence;
 import org.openscience.cdk.io.formats.IResourceFormat;
 import org.openscience.cdk.io.formats.SMILESFormat;
+import org.openscience.cdk.io.iterator.IteratingSMILESReader;
 import org.openscience.cdk.smiles.SmilesParser;
 import org.openscience.cdk.tools.ILoggingTool;
 import org.openscience.cdk.tools.LoggingToolFactory;
 
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+
 /**
  * This Reader reads files which have one SMILES string on each
  * line, where the format is given as below:
  * <pre>
  * CCOCC ethoxy ethane
  * </pre>
  * Thus first the SMILES, and then after the first space (or tab) on the line a title
- * that is stored as "SMIdbNAME" property in the Molecule.
+ * that is stored as {@link CDKConstants#TITLE}. For legacy compatibility the
+ * title is also placed in a "SMIdbNAME" property. If a line is invalid an empty
+ * molecule is inserted into the container set. The molecule will have the property
+ * {@link IteratingSMILESReader#BAD_SMILES_INPUT} set to the input line that
+ * could not be read.
  *
  * <p>For each line a molecule is generated, and multiple Molecules are
  * read as MoleculeSet.
@@ -160,25 +166,20 @@ private IAtomContainerSet readAtomContainerSet(IAtomContainerSet som) {
             while (line != null) {
                 logger.debug("Line: ", line);
 
-                String[] tokens = line.split("[\\s\\t]+", 2);
-                if (tokens.length > 2) throw new Exception("Malformed line");
-
-                String SMILES = tokens[0];
-                String name = null;
-                if (tokens.length == 2) name = tokens[1];
-
-                logger.debug("Line contains SMILES and name: ", SMILES, " + ", name);
+                final String name = suffix(line);
 
                 try {
-                    IAtomContainer molecule = sp.parseSmiles(SMILES);
+                    IAtomContainer molecule = sp.parseSmiles(line);
+                    molecule.setProperty("SMIdbNAME", name);
+                    molecule.setProperty(CDKConstants.TITLE, name);
                     som.addAtomContainer(molecule);
-                    if (name != null) {
-                        molecule.setProperty("SMIdbNAME", name);
-                    }
-                } catch (Exception exception) {
-                    logger.warn("This SMILES could not be parsed: ", SMILES);
+                } catch (CDKException exception) {
+                    logger.warn("This SMILES could not be parsed: ", line);
                     logger.warn("Because of: ", exception.getMessage());
                     logger.debug(exception);
+                    IAtomContainer empty = som.getBuilder().newInstance(IAtomContainer.class, 0, 0, 0, 0);
+                    empty.setProperty(IteratingSMILESReader.BAD_SMILES_INPUT, line);
+                    som.addAtomContainer(empty);
                 }
                 if (input.ready()) {
                     line = input.readLine();
@@ -197,4 +198,19 @@ private IAtomContainerSet readAtomContainerSet(IAtomContainerSet som) {
     public void close() throws IOException {
         input.close(); // closes the reader the SMILES lines are read from
     }
+
+    /**
+     * Extract the title part of an input line: everything after the first ' ' or
+     * '\t'. Only that one separator is dropped; further whitespace is kept as-is.
+     *
+     * @param line input line (SMILES, optionally followed by a title)
+     * @return text after the first space/tab, or "" when the line contains neither
+     */
+    private String suffix(final String line) {
+        for (int i = 0; i < line.length(); i++) {
+            char c = line.charAt(i);
+            if (c == ' ' || c == '\t') return line.substring(i + 1); // split at first separator only
+        }
+        return ""; // no separator found: the line carries no title
+    }
 }
diff --git a/storage/smiles/src/test/java/org/openscience/cdk/io/SMILESReaderTest.java b/storage/smiles/src/test/java/org/openscience/cdk/io/SMILESReaderTest.java
@@ -22,18 +22,28 @@
  *  */
 package org.openscience.cdk.io;
 
-import java.io.InputStream;
-
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.openscience.cdk.AtomContainerSet;
 import org.openscience.cdk.ChemFile;
+import org.openscience.cdk.exception.CDKException;
 import org.openscience.cdk.interfaces.IAtomContainer;
 import org.openscience.cdk.interfaces.IAtomContainerSet;
+import org.openscience.cdk.interfaces.IChemObjectBuilder;
+import org.openscience.cdk.io.iterator.IteratingSMILESReader;
+import org.openscience.cdk.silent.SilentChemObjectBuilder;
 import org.openscience.cdk.tools.ILoggingTool;
 import org.openscience.cdk.tools.LoggingToolFactory;
 
+import java.io.InputStream;
+import java.io.StringReader;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.CoreMatchers.notNullValue;
+import static org.hamcrest.CoreMatchers.nullValue;
+import static org.hamcrest.MatcherAssert.assertThat;
+
 /**
  * TestCase for reading SMILES files with the SMILESReader.
  *
@@ -100,5 +110,23 @@ public void testReadingSmiFile_3() throws Exception {
         IAtomContainerSet som = reader.read(new AtomContainerSet());
         Assert.assertEquals(5, som.getAtomContainerCount());
     }
+
+    @Test 
+    public void badSmilesLine() throws CDKException {
+        IChemObjectBuilder bldr = SilentChemObjectBuilder.getInstance();
+        String input = "C\nn1cccc1\nc1ccccc1\n"; // three lines; the 2nd is expected to be rejected
+        DefaultChemObjectReader cor = new SMILESReader(new StringReader(input));
+        IAtomContainerSet mols = cor.read(bldr.newInstance(IAtomContainerSet.class));
+        assertThat(mols.getAtomContainerCount(), is(3)); // the bad line still yields an entry
+        assertThat(mols.getAtomContainer(0).getAtomCount(), is(1)); // "C" is read normally
+        assertThat(mols.getAtomContainer(0).getProperty(IteratingSMILESReader.BAD_SMILES_INPUT),
+                   nullValue());
+        assertThat(mols.getAtomContainer(1).getAtomCount(), is(0)); // placeholder for the bad line is empty
+        assertThat(mols.getAtomContainer(1).getProperty(IteratingSMILESReader.BAD_SMILES_INPUT),
+                   notNullValue());
+        assertThat(mols.getAtomContainer(2).getAtomCount(), is(6)); // reading continues after the bad line
+        assertThat(mols.getAtomContainer(2).getProperty(IteratingSMILESReader.BAD_SMILES_INPUT),
+                   nullValue());
+    }
 
 }