Skip to content

Commit

Permalink
Bad SMILES are marked with an empty molecule and property. This mirro…
Browse files Browse the repository at this point in the history
…rs behaviour of the IteratingSMILESReader.
  • Loading branch information
johnmay committed Aug 2, 2015
1 parent 8afd272 commit 6ded684
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 24 deletions.
Expand Up @@ -23,13 +23,7 @@
*/
package org.openscience.cdk.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IAtomContainerSet;
Expand All @@ -39,18 +33,30 @@
import org.openscience.cdk.interfaces.IChemSequence;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.SMILESFormat;
import org.openscience.cdk.io.iterator.IteratingSMILESReader;
import org.openscience.cdk.smiles.SmilesParser;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

/**
* This Reader reads files which have one SMILES string on each
* line, where the format is given as below:
* <pre>
* CCOCC ethoxy ethane
* </pre>
* Thus first the SMILES, and then after the first space (or tab) on the line a title
* that is stored as "SMIdbNAME" property in the Molecule.
* that is stored as {@link CDKConstants#TITLE}. For legacy compatibility the
* title is also placed in a "SMIdbNAME" property. If a line is invalid an empty
* molecule is inserted into the container set. The molecule will have the property
* {@link IteratingSMILESReader#BAD_SMILES_INPUT} set to the input line that
* could not be read.
*
* <p>For each line a molecule is generated, and multiple Molecules are
* read as MoleculeSet.
Expand Down Expand Up @@ -160,25 +166,20 @@ private IAtomContainerSet readAtomContainerSet(IAtomContainerSet som) {
while (line != null) {
logger.debug("Line: ", line);

String[] tokens = line.split("[\\s\\t]+", 2);
if (tokens.length > 2) throw new Exception("Malformed line");

String SMILES = tokens[0];
String name = null;
if (tokens.length == 2) name = tokens[1];

logger.debug("Line contains SMILES and name: ", SMILES, " + ", name);
final String name = suffix(line);

try {
IAtomContainer molecule = sp.parseSmiles(SMILES);
IAtomContainer molecule = sp.parseSmiles(line);
molecule.setProperty("SMIdbNAME", name);
molecule.setProperty(CDKConstants.TITLE, name);
som.addAtomContainer(molecule);
if (name != null) {
molecule.setProperty("SMIdbNAME", name);
}
} catch (Exception exception) {
logger.warn("This SMILES could not be parsed: ", SMILES);
} catch (CDKException exception) {
logger.warn("This SMILES could not be parsed: ", line);
logger.warn("Because of: ", exception.getMessage());
logger.debug(exception);
IAtomContainer empty = som.getBuilder().newInstance(IAtomContainer.class, 0, 0, 0, 0);
empty.setProperty(IteratingSMILESReader.BAD_SMILES_INPUT, line);
som.addAtomContainer(empty);
}
if (input.ready()) {
line = input.readLine();
Expand All @@ -197,4 +198,19 @@ private IAtomContainerSet readAtomContainerSet(IAtomContainerSet som) {
public void close() throws IOException {
    // Closing this reader simply closes the input that lines are read from;
    // any IOException raised by that close is passed on to the caller.
    this.input.close();
}

/**
 * Obtain the suffix (title) that follows the SMILES on an input line. The
 * SMILES ends at the first ' ' or '\t' and the suffix begins at the first
 * character after the whole run of such termination characters. Skipping
 * the complete run means that {@code "CCO \t ethanol"} yields
 * {@code "ethanol"} and not a title that starts with white space.
 *
 * @param line input line, must not be {@code null}
 * @return the suffix - or an empty string if the line has no terminator or
 *         nothing follows the termination characters
 */
private String suffix(final String line) {
    final int length = line.length();
    for (int i = 0; i < length; i++) {
        final char c = line.charAt(i);
        if (c == ' ' || c == '\t') {
            // step over every consecutive separator, not just the first one
            int start = i + 1;
            while (start < length && (line.charAt(start) == ' ' || line.charAt(start) == '\t')) {
                start++;
            }
            return line.substring(start);
        }
    }
    return "";
}
}
Expand Up @@ -22,18 +22,28 @@
* */
package org.openscience.cdk.io;

import java.io.InputStream;

import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.openscience.cdk.AtomContainerSet;
import org.openscience.cdk.ChemFile;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IAtomContainerSet;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.io.iterator.IteratingSMILESReader;
import org.openscience.cdk.silent.SilentChemObjectBuilder;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;

import java.io.InputStream;
import java.io.StringReader;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.hamcrest.MatcherAssert.assertThat;

/**
* TestCase for reading SMILES input with the SMILESReader.
*
Expand Down Expand Up @@ -100,5 +110,23 @@ public void testReadingSmiFile_3() throws Exception {
IAtomContainerSet som = reader.read(new AtomContainerSet());
Assert.assertEquals(5, som.getAtomContainerCount());
}

/**
 * Reads three SMILES lines of which the second one is expected to be
 * rejected. The reader must still return three containers: the rejected
 * line is represented by an empty container that carries the
 * {@link IteratingSMILESReader#BAD_SMILES_INPUT} property, whereas the
 * containers of the accepted lines must not carry that property.
 */
@Test
public void badSmilesLine() throws CDKException {
    final String threeLines = "C\nn1cccc1\nc1ccccc1\n";
    final DefaultChemObjectReader reader = new SMILESReader(new StringReader(threeLines));
    final IChemObjectBuilder builder = SilentChemObjectBuilder.getInstance();
    final IAtomContainerSet parsed = reader.read(builder.newInstance(IAtomContainerSet.class));

    // every input line yields exactly one container, readable or not
    assertThat(parsed.getAtomContainerCount(), is(3));

    // line 1: "C" - one atom, not flagged
    final IAtomContainer first = parsed.getAtomContainer(0);
    assertThat(first.getAtomCount(), is(1));
    assertThat(first.getProperty(IteratingSMILESReader.BAD_SMILES_INPUT),
               nullValue());

    // line 2: "n1cccc1" - empty placeholder flagged as bad input
    final IAtomContainer second = parsed.getAtomContainer(1);
    assertThat(second.getAtomCount(), is(0));
    assertThat(second.getProperty(IteratingSMILESReader.BAD_SMILES_INPUT),
               notNullValue());

    // line 3: "c1ccccc1" - six atoms, not flagged
    final IAtomContainer third = parsed.getAtomContainer(2);
    assertThat(third.getAtomCount(), is(6));
    assertThat(third.getProperty(IteratingSMILESReader.BAD_SMILES_INPUT),
               nullValue());
}

}

0 comments on commit 6ded684

Please sign in to comment.