Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Fixed merge conflicts manually
  • Loading branch information
egonw committed Nov 7, 2011
2 parents 68e45f4 + 6c3de72 commit c49322e
Show file tree
Hide file tree
Showing 11 changed files with 312 additions and 171 deletions.
Expand Up @@ -50,6 +50,7 @@
import net.bioclipse.core.ResourcePathTransformer;
import net.bioclipse.core.business.BioclipseException;
import net.bioclipse.core.domain.IMolecule;
import net.bioclipse.core.domain.IMolecule.Property;
import net.bioclipse.jobs.BioclipseUIJob;
import net.bioclipse.ui.business.IUIManager;

Expand All @@ -61,6 +62,7 @@
import org.eclipse.core.runtime.FileLocator;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.IStatus;
import org.eclipse.core.runtime.NullProgressMonitor;
import org.eclipse.core.runtime.Path;
import org.eclipse.core.runtime.Status;
import org.eclipse.core.runtime.content.IContentType;
Expand Down Expand Up @@ -256,6 +258,48 @@ public void testLoadMoleculeFromSMILESFile() throws IOException,
}

}

/**
 * Checks that the extra columns of a SMILES file end up as properties
 * on the loaded molecules, both when the file is addressed by path and
 * when it is handed over as an <code>IFile</code>.
 */
@Test
public void testBug3055_SmilesProperties() throws Exception {
    // First fixture: resolved to a file system path before loading.
    URL fixtureUrl = FileLocator.toFileURL(
        getClass().getResource("/testFiles/molsforSMILESbug.smi")
                  .toURI().toURL());
    List<ICDKMolecule> loadedFromPath =
        cdk.loadSMILESFile(fixtureUrl.getFile());
    assertNotNull(
        loadedFromPath.get(0).getProperty("name", Property.USE_CACHED));

    // Second fixture: served through a mocked IFile with a "smi" extension.
    MockIFile smilesFile = new MockIFile(
        getClass().getResourceAsStream("/testFiles/testsmi2sdf.smi")
    ).extension("smi");
    List<ICDKMolecule> loadedFromFile =
        cdk.loadSMILESFile(smilesFile, new NullProgressMonitor());

    // Every data row must have produced a molecule.
    assertEquals(8, loadedFromFile.size());

    // Header name -> value expected on the first molecule, in column order.
    String[][] expectedOnFirst = {
        {"PUBCHEM_SID", "842267"},
        {"PUBCHEM_EXT_DATASOURCE_REGID", ""},
        {"PUBCHEM_CID", "644526"},
        {"PUBCHEM_ACTIVITY_OUTCOME", "2"},
        {"PUBCHEM_ACTIVITY_SCORE", "26"},
        {"PUBCHEM_ACTIVITY_URL", "\"\""},
        {"PUBCHEM_ASSAYDATA_COMMENT", "20100519"},
        {"PUBCHEM_ASSAYDATA_REVOKE", "\"\""},
        {"1", "123.22"},
        {"2", "10.2743"},
    };
    for (String[] column : expectedOnFirst) {
        assertEquals(column[1],
                     loadedFromFile.get(0).getAtomContainer()
                                   .getProperty(column[0]));
    }
}

@Test
public void testloadMoleculesFromSMILESCheck() throws Exception {
Expand All @@ -274,6 +318,7 @@ public void testloadMoleculesFromSMILESCheck() throws Exception {

List<ICDKMolecule> molecules = cdk.loadSMILESFile(file);
Assert.assertNotNull( molecules );
Assert.assertEquals( input.length, molecules.size() );
List<String> inputList = new ArrayList<String>(Arrays.asList( input ));

for(ICDKMolecule molecule:molecules) {
Expand Down
@@ -0,0 +1,6 @@
smiles name class
C[NH+](C)CCC(=O)c1ccccc1 NSC_89 1
c1ccnc(c1)C(=O)[O-] NSC_171 -1
CC1CC(C(=O)C(C1)C(CC2CC(=O)NC(=O)C2)O)C NSC_185 1
CC1C(OC=C2C1=C(C(=O)C(=C2O)C(=O)[O-])C)C NSC_186 1
c1ccc(cc1)/C=C\2/C(=O)OC(=N2)c3ccccc3 NSC_291 1
@@ -0,0 +1,9 @@
SMILES,"PUBCHEM_SID","PUBCHEM_EXT_DATASOURCE_REGID","PUBCHEM_CID","PUBCHEM_ACTIVITY_OUTCOME","PUBCHEM_ACTIVITY_SCORE","PUBCHEM_ACTIVITY_URL","PUBCHEM_ASSAYDATA_COMMENT","PUBCHEM_ASSAYDATA_REVOKE",1,2
C1=CC=C(C=C1)OCCN2C3=CC=CC=C3SC2=O,842267,,644526,2,26,"",20100519,"",123.22,10.2743
CN\1C2=CC=CC=C2S/C1=C\C(=O)C3CCC3,842317,,5409720,2,30,"",20100519,"",137.55,10.0768
C1OC2=C(O1)C=C(C=C2)NC(=O)CSC3=NN=NN3CC4=CC=CC=C4,842386,,644654,2,18,"",20100519,"",84.67,3.7112
C1=CC=C(C=C1)NC(=O)C2=CC3=C(C=C2)N=C(S3)N,842516,,644786,2,29,"",20100519,"",137.13,5.14689
C1=CC=C(C=C1)OCC2=NN=C3N2N=C(S3)C4=CC=CS4,842734,,645009,2,17,"",20100519,"",80.71,14.5673
C1=CC=C(C=C1)C2=NC3=CC=CC=C3C4=NN=CN24,843032,,645307,2,26,"",20100519,"",123.13,7.53848
CC(=O)NC1=NC2=C(S1)C=C(C=C2)C(=O)NC,843041,,645316,1,4,"",20100519,"",22.68,0.948018
CC1=NN=C(C2=CC=CC=C12)NC3=CC=CC=N3,843042,,645317,2,31,"",20100519,"",142.71,3.81868
Expand Up @@ -146,6 +146,7 @@
import org.openscience.cdk.nonotify.NNMoleculeSet;
import org.openscience.cdk.nonotify.NoNotificationChemObjectBuilder;
import org.openscience.cdk.similarity.Tanimoto;
import org.openscience.cdk.smiles.DeduceBondSystemTool;
import org.openscience.cdk.smiles.SmilesGenerator;
import org.openscience.cdk.smiles.SmilesParser;
import org.openscience.cdk.smiles.smarts.SMARTSQueryTool;
Expand Down Expand Up @@ -1762,93 +1763,176 @@ public List<ICDKMolecule> loadSMILESFile( IFile file,
return loadSMILESFile( file.getContents(), monitor );
}

/**
 * Guesses the column separator used on a line of text.
 * <p>
 * The candidates comma, tab and single space are tried in that order and
 * the first one that splits the line into more than one part is returned.
 * Because <code>String.split</code> discards trailing empty strings, a
 * separator that only occurs at the very end of the line does not count.
 *
 * @param line line to inspect; may be <code>null</code>
 * @return the separator found, or <code>null</code> if the line is
 *         <code>null</code> or none of the candidates splits it
 */
private static String determineSeparator(String line) {
    // Nothing to inspect: report "no separator" instead of failing with a
    // NullPointerException inside split().
    if (line == null) {
        return null;
    }

    // Order matters: a comma-separated line may well contain spaces.
    final String[] possibleSeparators = {",", "\t", " "};
    for (String candidate : possibleSeparators) {
        if (line.split(candidate).length > 1) {
            return candidate;
        }
    }
    return null;
}

public List<ICDKMolecule> loadSMILESFile( InputStream contents,
IProgressMonitor monitor )
throws CoreException, IOException {


BufferedInputStream buf = new BufferedInputStream(contents);
InputStreamReader reader = new InputStreamReader(buf);
BufferedReader br = new BufferedReader(reader);
BufferedReader breader = new BufferedReader(reader);

if ( !br.ready() ) {
if ( !breader.ready() ) {
throw new IOException("Input was not ready to be read.");
}
List<ICDKMolecule> molecules = new ArrayList<ICDKMolecule>();
DeduceBondSystemTool bondSystemTool = new DeduceBondSystemTool();
List<String> lines = new LinkedList<String>();
for ( String line = breader.readLine() ;
line != null ;
line = breader.readLine() ) {
lines.add( line );
}
breader.close();

try {

int noLines = lines.size();

logger.debug("Number of lines in file: " + noLines);

monitor.beginTask("Converting SMILES file to SDF", noLines);

String firstLine = lines.remove( 0 );

class StringPair {
final String first;
final String second;
StringPair(String first,String second) {
this.first = first;
this.second = second;
if (firstLine==null)
throw new IOException("First line is null!");

logger.debug("Header line is: " + firstLine);

//Determine separator from first line
String separator=determineSeparator(firstLine);

//First line is header
String[] headers;
if ( separator == null) {
// no separator so assuming only a SMILES string on each row
headers = new String[] {"smiles"};
}
};

String line = br.readLine();

if (line == null)
throw new IOException("Input had null content");
int cnt = 0;
List<StringPair> list = new LinkedList<StringPair>();
while (line != null) {
// System.out.println("Line " + cnt + ": " + line);
Scanner smilesScanner = new Scanner(line).useDelimiter("\\s+");
String part1 = null;
String part2 = null;
if (smilesScanner.hasNext()) {
part1 = smilesScanner.next();
if (smilesScanner.hasNext()) {
part2 = smilesScanner.next();
}
else {
headers = firstLine.split(separator);
}


// or is it?
boolean haveHeaders = false;
try {
fromSMILES( headers.length == 1 ? firstLine
: headers[0] );
}
if (part1 != null) {
if (part2 != null) {
list.add( new StringPair(part1,part2) );
}else{
list.add( new StringPair(part1,"entry-" + cnt) );
catch (BioclipseException e) {
// well at least it's not SMILES so suppose it's headers
haveHeaders = true;
}

if (!haveHeaders) {
lines.add( 0, firstLine );
if ( headers.length != 1 ) {
headers = new String[] {"smiles", "identifier"};
}
// System.out
// .println(" - " + part1 + " -> " + entries.get(part1));
}
// Get next line
line = br.readLine();
cnt++;
}
// Depict where the smiles are, in first or second
boolean smilesInFirst = true;
String firstKey = list.get( 0 ).first;
String firstVal = list.get( 0 ).second;
ICDKMolecule mol = null;
try {
mol = fromSMILES(firstKey);
} catch (BioclipseException e) {
}
if (mol == null) {
try {
mol = fromSMILES(firstVal);
smilesInFirst = false;
} catch (BioclipseException e) {

//Strip headers of " and spaces
for (int i=0; i< headers.length; i++){
headers[i]=headers[i].trim();
if (headers[i].startsWith("\""))
headers[i] = headers[i].substring(1);
if (headers[i].endsWith("\""))
headers[i] = headers[i].substring(
0,
headers[i].length() - 1 );
}
}
List<ICDKMolecule> mols = new RecordableList<ICDKMolecule>();
for (StringPair part : list) {
if (smilesInFirst) {

//Read subsequent lines until end
int lineno=2;
for (String line : lines) {

if (monitor.isCanceled())
return null;

String[] parts = headers.length > 1 ? line.split(separator)
: new String[] {line};

//Assert header is same size as data
if (parts.length!=headers.length)
throw new BioclipseException("Header and data have " +
"different number of columns. " +
"Header size=" + headers.length +
"Line " + lineno + " size=" + parts.length );

//Part 1 is expected to be SMILES
String smiles=parts[0];

//Create a new CDKMolecule from smiles
ICDKMolecule mol = fromSMILES(smiles);

try {
mol = fromSMILES(part.first);
mol.setName(part.second);
mols.add(mol);
} catch (BioclipseException e) {
org.openscience.cdk.interfaces.IMolecule newAC
= bondSystemTool.fixAromaticBondOrders(
(org.openscience.cdk.interfaces.IMolecule)
mol.getAtomContainer() );
mol = new CDKMolecule(newAC);
}
catch (CDKException e) {
logger.error("Could not deduce bond orders for mol: " + mol);
}
} else {
try {
mol = fromSMILES(part.second);
mol.setName(part.first);
mols.add(mol);
} catch (BioclipseException e) {

//Store rest of parts as properties on mol
for (int i=1; i<headers.length;i++){
mol.getAtomContainer().setProperty(headers[i], parts[i]);
}

//Filter molecules with failing atom types
boolean filterout=false;
for (IAtom atom : mol.getAtomContainer().atoms()){
if (atom.getAtomTypeName()==null ||
atom.getAtomTypeName().equals("X"))
filterout=true;
}

if (filterout)
logger.debug("Skipped molecule " + lineno + " due to " +
"failed atom typing.");
else
molecules.add(mol);

//Read next line
lineno++;

monitor.worked(1);
if (lineno%100==0){
if (monitor.isCanceled())
return null;
monitor.subTask("Processed: " + lineno + "/" + noLines);
}
}
} catch (IOException e) {
e.printStackTrace();
} catch (BioclipseException e) {
e.printStackTrace();
}finally{
monitor.done();
}
return mols;
logger.debug("Read " + molecules.size() +" molecules.");
return molecules;
}

public int getNoMolecules(IFile file)
Expand Down
Expand Up @@ -901,6 +901,13 @@ public List<ICDKMolecule> extractFromSDFile( IFile file,
throws BioclipseException, InvocationTargetException;

/**
 * Variant of <code>extractFromSDFile</code> that additionally takes a
 * progress monitor.
 * <p>
 * NOTE(review): presumably returns the entries between
 * <code>startentry</code> and <code>endentry</code> of the given SD file;
 * whether the bounds are zero- or one-based and inclusive is not visible
 * from this declaration -- confirm against the implementation.
 *
 * @param file       the SD file to read from
 * @param startentry first entry of the requested range
 * @param endentry   last entry of the requested range
 * @param monitor    progress monitor handed to the operation
 * @return the extracted molecules
 * @throws BioclipseException        declared by this method; the exact
 *                                   conditions are not visible here
 * @throws InvocationTargetException declared by this method; the exact
 *                                   conditions are not visible here
 */
@Recorded
@TestMethods("testExtractFromSDFile_IFile_int_int")
public List<ICDKMolecule> extractFromSDFile( IFile file,
int startentry,
int endentry,
IProgressMonitor monitor)
throws BioclipseException, InvocationTargetException;
@Recorded
@PublishedMethod(
params = "String file, int startentry, int endentry",
methodSummary = "Extracts a number of entries from an sd file. " +
Expand Down Expand Up @@ -1176,7 +1183,8 @@ public void saveSDFile( IFile molFile, List<? extends IMolecule> mols,
@PublishedMethod(
params="List<IMolecule> mols, double firstRatio",
methodSummary="Split a list of molecules in 2 parts by a ratio.")
public List<List<IMolecule>> randomSplit2parts(List<IMolecule> mols_in, double firstRatio);
public List<List<IMolecule>> randomSplit2parts(List<IMolecule> mols_in, double firstRatio);


/**
 * Loads the molecules described in a SMILES file.
 * <p>
 * NOTE(review): this declaration has no <code>throws</code> clause and no
 * <code>@Recorded</code>/<code>@PublishedMethod</code> annotations, unlike
 * the neighbouring declarations -- confirm that this is intended and that
 * it matches the implementing method.
 *
 * @param file    the SMILES file to read
 * @param monitor progress monitor handed to the operation
 * @return the molecules read from the file
 */
public List<ICDKMolecule> loadSMILESFile( IFile file,
IProgressMonitor monitor );
}

0 comments on commit c49322e

Please sign in to comment.