Skip to content

Commit

Permalink
Added more unit tests for the MDLV2000Writer - including corner cases…
Browse files Browse the repository at this point in the history
… such as multiline RGPs, atom aliases, truncated atom aliases and pseudo atoms with null labels. The MDLV2000Writer can now handle all of these cases correctly.

Change-Id: I3d134d23fc6252d7425fa2a8c50fb308d058ec7d
Signed-off-by: Stephan Beisken <beisken@ebi.ac.uk>
Signed-off-by: Egon Willighagen <egonw@users.sourceforge.net>
  • Loading branch information
Stephan Beisken authored and egonw committed Dec 2, 2012
1 parent 01f8db4 commit e3b1c8e
Show file tree
Hide file tree
Showing 2 changed files with 206 additions and 29 deletions.
130 changes: 101 additions & 29 deletions src/main/org/openscience/cdk/io/MDLV2000Writer.java
Expand Up @@ -33,9 +33,14 @@
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.annotations.TestClass;
Expand Down Expand Up @@ -97,6 +102,9 @@ public class MDLV2000Writer extends DefaultChemObjectWriter {
private final static ILoggingTool logger =
LoggingToolFactory.createLoggingTool(MDLV2000Writer.class);

// Matches labels of numbered R groups such as "R1" or "R12"; the digits are
// captured in group 1. Declared static final so the expression is compiled
// once and shared: Pattern instances are immutable and safe to reuse.
private static final Pattern NUMERED_R_GROUP = Pattern.compile("R(\\d+)");

private BooleanIOSetting forceWriteAs2DCoords;

// The next two options are MDL Query format options, not really
Expand Down Expand Up @@ -247,7 +255,8 @@ private void writeChemFile(IChemFile file) throws Exception {
*/
public void writeMolecule(IAtomContainer container) throws Exception {
String line = "";
List<Integer> rgroupList=null;
Map<Integer,Integer> rgroups = null;
Map<Integer,String> aliases = null;
// write header block
// lines get shortened to 80 chars, that's in the spec
String title = (String)container.getProperty(CDKConstants.TITLE);
Expand Down Expand Up @@ -307,18 +316,54 @@ public void writeMolecule(IAtomContainer container) throws Exception {
if(container.getAtom(f) instanceof IPseudoAtom){
//according to http://www.google.co.uk/url?sa=t&ct=res&cd=2&url=http%3A%2F%2Fwww.mdl.com%2Fdownloads%2Fpublic%2Fctfile%2Fctfile.pdf&ei=MsJjSMbjAoyq1gbmj7zCDQ&usg=AFQjCNGaJSvH4wYy4FTXIaQ5f7hjoTdBAw&sig2=eSfruNOSsdMFdlrn7nhdAw an R group is written as R#
IPseudoAtom pseudoAtom = (IPseudoAtom) container.getAtom(f);
if (pseudoAtom.getSymbol().equals("R") && pseudoAtom.getLabel().length()>1) {
line += "R# ";
if (rgroupList==null) {
rgroupList = new ArrayList<Integer>();
String label = pseudoAtom.getLabel();
if(label == null) // set to empty string if null
label = "";

// firstly check if it's a numbered R group
Matcher matcher = NUMERED_R_GROUP.matcher(label);
if (pseudoAtom.getSymbol().equals("R")
&& !label.isEmpty()
&& matcher.matches()) {

line += "R# ";
if (rgroups==null) {
// we use a tree map to ensure the output order is always the same
rgroups = new TreeMap<Integer, Integer>();
}
Integer rGroupNumber = new Integer(pseudoAtom.getLabel().substring(1));
rgroupList.add(f+1);
rgroupList.add(rGroupNumber);

rgroups.put(f + 1, Integer.parseInt(matcher.group(1)));

}
else
line += formatMDLString(((IPseudoAtom) container.getAtom(f)).getLabel(), 3);
// not a numbered R group - note the symbol may still be R
else {

// note: no distinction made between alias and pseudo atoms - normally
// aliases maintain their original symbol while pseudo atoms are
// written with a 'A' in the atom block

// if the label is longer than 3 characters we need
// to use an alias.
if(label.length() > 3) {

if(aliases == null)
aliases = new TreeMap<Integer, String>();


aliases.put(f + 1, label); // atom index to alias

line += formatMDLString(atom.getSymbol(), 3);

} else { // label is short enough to fit in the atom block

// make sure it's not empty
if(!label.isEmpty())
line += formatMDLString(label, 3);
else
line += formatMDLString(atom.getSymbol(), 3);

}
}

}else{
line += formatMDLString(container.getAtom(f).getSymbol(), 3);
}
Expand Down Expand Up @@ -454,24 +499,51 @@ else if (Order.QUADRUPLE == bond.getOrder())
}

//write RGP line (max occurrence is 16 data points per line)
if (rgroupList!=null) {
StringBuffer rgpLine=new StringBuffer();
int cnt=0;
for (int i=1; i<= rgroupList.size(); i++) {

rgpLine.append(formatMDLInt((rgroupList.get(i-1)), 4));
i++;
rgpLine.append(formatMDLInt((rgroupList.get(i-1)), 4));

cnt++;
if (i==rgroupList.size() || i==16 ) {
rgpLine.insert(0, "M RGP"+formatMDLInt(cnt, 3));
writer.write(rgpLine.toString());
writer.newLine();
rgpLine=new StringBuffer();
cnt=0;
}
}
if (rgroups!=null) {
StringBuilder rgpLine=new StringBuilder();
int cnt = 0;

// the map is a TreeMap keyed by atom number, so the entries are
// visited in ascending atom order and the output is deterministic
for(Map.Entry<Integer,Integer> e : rgroups.entrySet()) {
rgpLine.append(formatMDLInt(e.getKey(), 4));
rgpLine.append(formatMDLInt(e.getValue(), 4));
cnt++;
if(cnt == 8){
rgpLine.insert(0, "M RGP" + formatMDLInt(cnt, 3));
writer.write(rgpLine.toString());
writer.newLine();
rgpLine = new StringBuilder();
cnt = 0;
}
}
if(cnt != 0) {
rgpLine.insert(0, "M RGP" + formatMDLInt(cnt, 3));
writer.write(rgpLine.toString());
writer.newLine();
}


}

// write atom aliases
if(aliases != null){

for (Map.Entry<Integer, String> e : aliases.entrySet()){

writer.write("A" + formatMDLInt(e.getKey(), 5));
writer.newLine();

String label = e.getValue();

// fixed width file - doubtful someone would have a label > 70 but trim if they do
if(label.length() > 70)
label = label.substring(0, 70);

writer.write(label);
writer.newLine();

}
}

// close molecule
Expand Down
105 changes: 105 additions & 0 deletions src/test/org/openscience/cdk/io/MDLV2000WriterTest.java
Expand Up @@ -392,4 +392,109 @@ public void testWritePseudoAtoms() throws Exception {
Assert.assertTrue(output.indexOf("Leu") != -1);
}


/**
 * Writes a two-atom container (a carbon followed by a pseudo atom labelled
 * "tRNA") and checks that the output contains both an "A" entry referring
 * to atom 2 and the full "tRNA" label.
 *
 * @cdk.bug 1263
 * @throws Exception if building or writing the container fails
 */
@Test
public void testWritePseudoAtoms_LongLabel() throws Exception {
    IChemObjectBuilder chemBuilder = DefaultChemObjectBuilder.getInstance();
    IAtomContainer molecule = chemBuilder.newInstance(IAtomContainer.class);

    // atom 1 is a plain carbon, atom 2 carries the four-character label
    IAtom carbon = chemBuilder.newInstance(IAtom.class, "C");
    IAtom longLabelAtom = chemBuilder.newInstance(IPseudoAtom.class, "tRNA");
    molecule.addAtom(carbon);
    molecule.addAtom(longLabelAtom);

    StringWriter buffer = new StringWriter();
    MDLV2000Writer mdlWriter = new MDLV2000Writer(buffer);
    mdlWriter.write(molecule);

    String written = buffer.toString();
    boolean hasAliasEntry = written.contains("A 2");
    boolean hasFullLabel = written.contains("tRNA");

    Assert.assertTrue(hasAliasEntry);
    Assert.assertTrue(hasFullLabel);
}

/**
 * Writes a container holding a carbon and a pseudo atom whose label has
 * been explicitly set to {@code null}, and checks that writing succeeds
 * and that the output contains an "R".
 *
 * @throws Exception if building or writing the container fails
 */
@Test
public void testWritePseudoAtoms_nullLabel() throws Exception {
    IChemObjectBuilder chemBuilder = DefaultChemObjectBuilder.getInstance();
    IAtomContainer molecule = chemBuilder.newInstance(IAtomContainer.class);

    IAtom carbon = chemBuilder.newInstance(IAtom.class, "C");

    // created with an empty label, then the label is cleared to null
    IPseudoAtom unlabelled = chemBuilder.newInstance(IPseudoAtom.class, "");
    unlabelled.setLabel(null);

    molecule.addAtom(carbon);
    molecule.addAtom(unlabelled);

    StringWriter buffer = new StringWriter();
    MDLV2000Writer mdlWriter = new MDLV2000Writer(buffer);
    mdlWriter.write(molecule);

    String written = buffer.toString();
    Assert.assertTrue(written.contains("R"));
}

/**
 * Writes a container of 19 pseudo atoms labelled R1 to R19 and checks that
 * the R group entries are wrapped over three RGP lines holding 8, 8 and 3
 * atom/group pairs respectively.
 *
 * NOTE(review): the expected strings are kept exactly as found; their
 * spacing may need to match the writer's fixed-width output - confirm.
 *
 * @throws Exception if building or writing the container fails
 */
@Test
public void testRGPLine_Multiline() throws Exception {
    IChemObjectBuilder chemBuilder = DefaultChemObjectBuilder.getInstance();
    IAtomContainer molecule = chemBuilder.newInstance(IAtomContainer.class);

    // atom n is the pseudo atom labelled "Rn", for n = 1..19
    for (int groupNumber = 1; groupNumber <= 19; groupNumber++) {
        String groupLabel = "R" + groupNumber;
        molecule.addAtom(chemBuilder.newInstance(IPseudoAtom.class, groupLabel));
    }

    StringWriter buffer = new StringWriter();
    MDLV2000Writer mdlWriter = new MDLV2000Writer(buffer);
    mdlWriter.write(molecule);
    String written = buffer.toString();

    String[] expectedLines = {
        "M RGP 8 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8",
        "M RGP 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16",
        "M RGP 3 17 17 18 18 19 19"
    };
    for (String expectedLine : expectedLines) {
        Assert.assertTrue(written.contains(expectedLine));
    }
}

/**
 * Writes a single pseudo atom with a very long label and checks that the
 * leading part of the label is present in the output while the complete,
 * untruncated label is not.
 *
 * @throws Exception if building or writing the container fails
 */
@Test
public void testAlaias_TruncatedLabel() throws Exception {
    String fullLabel = "This is a very long label - almost too long. it should be cut here -> and the rest is truncated";
    String keptPart = "This is a very long label - almost too long. it should be cut here ->";

    IChemObjectBuilder chemBuilder = DefaultChemObjectBuilder.getInstance();
    IAtomContainer molecule = chemBuilder.newInstance(IAtomContainer.class);
    molecule.addAtom(chemBuilder.newInstance(IPseudoAtom.class, fullLabel));

    StringWriter buffer = new StringWriter();
    MDLV2000Writer mdlWriter = new MDLV2000Writer(buffer);
    mdlWriter.write(molecule);
    String written = buffer.toString();

    // the beginning of the label must survive ...
    Assert.assertTrue(written.contains(keptPart));
    // ... but the full label must not have been written out
    Assert.assertFalse(written.contains(fullLabel));
}



}

0 comments on commit e3b1c8e

Please sign in to comment.