Author: mes Date: 2012-01-31 11:49:21 -0800 (Tue, 31 Jan 2012) New Revision: 28169
Modified: core3/impl/trunk/psi-mi-impl/impl/src/main/java/org/cytoscape/psi_mi/internal/plugin/MITABLine.java core3/impl/trunk/psi-mi-impl/impl/src/main/java/org/cytoscape/psi_mi/internal/plugin/PsiMiTabParser.java Log: updated PSI-MI tab parsing to be much faster Modified: core3/impl/trunk/psi-mi-impl/impl/src/main/java/org/cytoscape/psi_mi/internal/plugin/MITABLine.java =================================================================== --- core3/impl/trunk/psi-mi-impl/impl/src/main/java/org/cytoscape/psi_mi/internal/plugin/MITABLine.java 2012-01-31 19:47:23 UTC (rev 28168) +++ core3/impl/trunk/psi-mi-impl/impl/src/main/java/org/cytoscape/psi_mi/internal/plugin/MITABLine.java 2012-01-31 19:49:21 UTC (rev 28169) @@ -22,7 +22,16 @@ // 12 sourceDB|sourceDB // 13 interactionID|XXXX // 14 edgeScoreType:edgeScoreString|edgeScoreType:edgeScoreString +// subsequent optional columns are ignored! +// +// For a better description see: http://wiki.reactome.org/index.php/PSI-MITAB_interactions +// + +/** + * This class quickly reads a single line of PSI-MI Tab delimited format into a data + * structure for easy processing. + */ public class MITABLine { final char COLON = ':'; @@ -101,7 +110,7 @@ } - void readLine(String line) { + public void readLine(String line) { init(); // column 0 Modified: core3/impl/trunk/psi-mi-impl/impl/src/main/java/org/cytoscape/psi_mi/internal/plugin/PsiMiTabParser.java =================================================================== --- core3/impl/trunk/psi-mi-impl/impl/src/main/java/org/cytoscape/psi_mi/internal/plugin/PsiMiTabParser.java 2012-01-31 19:47:23 UTC (rev 28168) +++ core3/impl/trunk/psi-mi-impl/impl/src/main/java/org/cytoscape/psi_mi/internal/plugin/PsiMiTabParser.java 2012-01-31 19:49:21 UTC (rev 28169) @@ -41,25 +41,30 @@ private final static Pattern miNamePttr = Pattern.compile("\\(.+\\)"); private static final String TAB = "\t"; - private static final String INTERACTION = "interaction"; - // Attr Names - private static final String DETECTION_METHOD = ATTR_PREFIX + "interaction detection method"; + // Node Attr Names + private static final String INTERACTOR_TYPE = ATTR_PREFIX + "interactor type"; + private static final String ALIASES = ATTR_PREFIX + "aliases"; + private static final String TAXONIDS = ATTR_PREFIX + "taxon ID"; + private static final String TAXONDBS = ATTR_PREFIX + "taxon DB"; + + // Edge Attr Names + private static final String INTERACTION = CyEdge.INTERACTION; // should already exist + private static final String DETECTION_METHOD_ID = ATTR_PREFIX + "detection method ID"; + private static final String DETECTION_METHOD = ATTR_PREFIX + "detection method"; private static final String INTERACTION_TYPE = ATTR_PREFIX + "interaction type"; - private static final String SOURCE_DB = ATTR_PREFIX + "source database"; - private static final String INTERACTION_ID = ATTR_PREFIX + "Interaction ID"; - private static final String EDGE_SCORE = ATTR_PREFIX + "confidence score"; + private static final String INTERACTION_TYPE_ID = ATTR_PREFIX + "interaction type ID"; + private static final String SOURCE_DB = ATTR_PREFIX + "source DB"; + private static final String EDGE_SCORE = ATTR_PREFIX + "edge score"; + private static final String AUTHORS = ATTR_PREFIX + "authors"; + private static final String PUBLICATION_ID = ATTR_PREFIX + "publication ID"; + private static final String PUBLICATION_DB = ATTR_PREFIX + "publication DB"; // Stable IDs which maybe used for mapping later - private static final String UNIPROT = "uniprotkb"; - private static final String ENTREZ_GENE = "entrezgene/locuslink"; - private static final String ENTREZ_GENE_SYN = "entrez gene/locuslink"; - private static final String CHEBI = "chebi"; - - private static final String INTERACTOR_TYPE = ATTR_PREFIX + "interactor type"; private static final String COMPOUND = "compound"; + private Matcher matcher; private Map<String, CyNode> nodeMap; @@ -75,8 +80,10 @@ } public CyNetwork parse(final TaskMonitor taskMonitor) throws IOException { - + long start = System.currentTimeMillis(); + + taskMonitor.setProgress(-1.0); this.nodeMap = new HashMap<String, CyNode>(); @@ -94,31 +101,13 @@ final CyNetwork network = cyNetworkFactory.createNetwork(); - final CyTable nodeTable = network.getDefaultNodeTable(); - if (nodeTable.getColumn(INTERACTOR_TYPE) == null) - nodeTable.createColumn(INTERACTOR_TYPE, String.class, false); - if (nodeTable.getColumn(INTERACTOR_TYPE + ".name") == null) - nodeTable.createColumn(INTERACTOR_TYPE + ".name", String.class, false); + initColumns(network); - final CyTable edgeTable = network.getDefaultEdgeTable(); - if (edgeTable.getColumn(INTERACTION_ID) == null) - edgeTable.createColumn(INTERACTION_ID, String.class, false); - if (edgeTable.getColumn(INTERACTION_TYPE) == null) { - edgeTable.createListColumn(INTERACTION_TYPE, String.class, false); - edgeTable.createListColumn(INTERACTION_TYPE + ".name", String.class, false); - } - if (edgeTable.getColumn(DETECTION_METHOD) == null) { - edgeTable.createListColumn(DETECTION_METHOD, String.class, false); - edgeTable.createListColumn(DETECTION_METHOD + ".name", String.class, false); - } - if (edgeTable.getColumn(SOURCE_DB) == null) - edgeTable.createListColumn(SOURCE_DB, String.class, false); - if (edgeTable.getColumn(EDGE_SCORE) == null) - edgeTable.createListColumn(EDGE_SCORE, Double.class, false); - String line; final BufferedReader br = new BufferedReader(new InputStreamReader(inputStream), BUFFER_SIZE); + MITABLine mline = new MITABLine(); + long interactionCount = 0; while ((line = br.readLine()) != null) { @@ -132,19 +121,13 @@ continue; try { + + mline.readLine(line); - //aaa:bbb:ccc|aa:bb:cc|a:b:c<tab> - entry = line.split(TAB); + final String sourceRawID = mline.sourceRawID; + final String targetRawID = mline.targetRawID; - // Validate entry list. - if (entry == null || entry.length < COLUMN_COUNT) - continue; - - sourceID = entry[0].split(SEPARATOR); - targetID = entry[1].split(SEPARATOR); - final String sourceRawID = sourceID[0].split(SUBSEPARATOR)[1]; - final String targetRawID = targetID[0].split(SUBSEPARATOR)[1]; - + // create nodes CyNode source = nodeMap.get(sourceRawID); if (source == null) { source = network.addNode(); @@ -156,55 +139,48 @@ nodeMap.put(targetRawID, target); } - network.getRow(source).set(CyTableEntry.NAME, sourceRawID); - network.getRow(target).set(CyTableEntry.NAME, targetRawID); + CyRow sourceRow = network.getRow(source); + CyRow targetRow = network.getRow(target); - // Set type if not protein - if (sourceID[0].contains(CHEBI)) - network.getRow(source).set(INTERACTOR_TYPE, COMPOUND); - if (targetID[0].contains(CHEBI)) - network.getRow(target).set(INTERACTOR_TYPE, COMPOUND); + // set various node attrs + sourceRow.set(CyTableEntry.NAME, sourceRawID); + targetRow.set(CyTableEntry.NAME, targetRawID); - // Aliases - setAliases(network.getRow(source), entry[0].split(SEPARATOR)); - setAliases(network.getRow(target), entry[1].split(SEPARATOR)); - setAliases(network.getRow(source), entry[2].split(SEPARATOR)); - setAliases(network.getRow(target), entry[3].split(SEPARATOR)); - setAliases(network.getRow(source), entry[4].split(SEPARATOR)); - setAliases(network.getRow(target), entry[5].split(SEPARATOR)); + setInteractorType(sourceRow,mline.srcAliases); + setInteractorType(targetRow,mline.tgtAliases); - // Tax ID (pick first one only) - setTaxID(network.getRow(source), entry[9].split(SEPARATOR)[0]); - setTaxID(network.getRow(target), entry[10].split(SEPARATOR)[0]); + setAliases(sourceRow, mline.srcAliases, mline.srcDBs); + setAliases(targetRow, mline.tgtAliases, mline.tgtDBs); - sourceDB = entry[12].split(SEPARATOR); - interactionID = entry[13].split(SEPARATOR); + setTaxID(sourceRow, mline.srcTaxonIDs, mline.srcTaxonDBs); + setTaxID(targetRow, mline.tgtTaxonIDs, mline.tgtTaxonDBs); - edgeScore = entry[14].split(SEPARATOR); - - detectionMethods = entry[6].split(SEPARATOR); - interactionType = entry[11].split(SEPARATOR); - + // create edge final CyEdge e = network.addEdge(source, target, true); - network.getRow(e).set(INTERACTION, interactionID[0]); + CyRow edgeRow = network.getRow(e); + + // set various edge attrs + String interactionId = "unknown"; + if ( mline.interactionIDs.size() > 0 ) + interactionId = mline.interactionIDs.get(0); - setEdgeListAttribute(network.getRow(e), interactionType, INTERACTION_TYPE); - setEdgeListAttribute(network.getRow(e), detectionMethods, DETECTION_METHOD); - setEdgeListAttribute(network.getRow(e), sourceDB, SOURCE_DB); + edgeRow.set(INTERACTION, interactionId); + edgeRow.set(CyTableEntry.NAME, sourceRawID + " (" + interactionId + ") " + targetRawID); - // Map scores - setEdgeScoreListAttribute(network.getRow(e), edgeScore, EDGE_SCORE); + setTypedEdgeListAttribute(edgeRow, mline.interactionTypes, INTERACTION_TYPE_ID, INTERACTION_TYPE); + setTypedEdgeListAttribute(edgeRow, mline.detectionMethods, DETECTION_METHOD_ID, DETECTION_METHOD); + setEdgeListAttribute(edgeRow, mline.sourceDBs, SOURCE_DB); + setEdgeListAttribute(edgeRow, mline.edgeScoreStrings, EDGE_SCORE); - network.getRow(e).set(INTERACTION_ID, interactionID[0]); - - setPublication(network.getRow(e), entry[8].split(SEPARATOR), entry[7].split(SEPARATOR)); + setPublication(edgeRow, mline.publicationValues, mline.publicationDBs); + setAuthors(edgeRow, mline.authors); -// interactionCount++; -// taskMonitor.setStatusMessage(interactionCount + " interactions loaded."); } catch (Exception ex) { logger.warn("Could not parse this line: " + line, ex); continue; } + if ( ++interactionCount % 100 == 0 ) + taskMonitor.setStatusMessage("parsed " + interactionCount + " interactions"); } br.close(); @@ -216,113 +192,72 @@ return network; } - private void setTaxID(CyRow row, String value) { - String[] buf = value.split(SUBSEPARATOR, 2); - String attrName; - String taxonName; - if (buf != null && buf.length == 2) { - attrName = ATTR_PREFIX + buf[0]; + private void setTaxID(CyRow row, List<String> taxonIDs, List<String> taxonDBs) { + row.set(TAXONIDS,taxonIDs); + row.set(TAXONDBS,taxonDBs); + } - if (row.getTable().getColumn(attrName) == null) { - row.getTable().createColumn(attrName, String.class, false); - row.getTable().createColumn(attrName + ".name", String.class, false); - } - - matcher = miNamePttr.matcher(buf[1]); - if (matcher.find()) { - taxonName = matcher.group(); - row.set(attrName, buf[1].split("\\(")[0]); - row.set(attrName + ".name", taxonName.substring(1, taxonName.length() - 1)); - } else { - row.set(attrName, buf[1]); - } + private void setPublication(CyRow row, List<String> pubID, List<String> pubDB) { + for ( int i = 0; i < pubID.size(); i++ ) { + listAttrMapper(row, PUBLICATION_ID, pubID.get(i)); + listAttrMapper(row, PUBLICATION_DB, pubDB.get(i)); } } - private void setPublication(CyRow row, String[] pubID, String[] authors) { - String key = null; - String[] temp; - - for (String val : pubID) { - temp = val.split(SUBSEPARATOR, 2); - if (temp == null || temp.length < 2) - continue; - - key = ATTR_PREFIX + temp[0]; - listAttrMapper(row, key, temp[1]); - } - + private void setAuthors(CyRow row, List<String> authors) { for (String val : authors) { - key = ATTR_PREFIX + "author"; - listAttrMapper(row, key, val); + listAttrMapper(row, AUTHORS, val); } } - private void setAliases(CyRow row, String[] entry) { - String key = null; - String[] temp; - String value; - - for (String val : entry) { - temp = val.split(SUBSEPARATOR, 2); - if (temp == null || temp.length < 2) - continue; - - key = ATTR_PREFIX + temp[0]; - value = temp[1].replaceAll("\\(.+\\)", ""); - listAttrMapper(row, key, value); + private void setAliases(CyRow row, List<String> aliases, List<String> aliasDBs) { + for ( String s : aliases ) { + int ind = s.indexOf('('); + if ( ind > 0 ) + s = s.substring(0,ind); + listAttrMapper(row, ALIASES, s); } } - private void setEdgeListAttribute(CyRow row, String[] entry, String key) { - - String value; - String name; - + private void setEdgeListAttribute(CyRow row, List<String> entry, String key) { for (String val : entry) { - value = trimPSITerm(val); - name = trimPSIName(val); - - listAttrMapper(row, key, value); - listAttrMapper(row, key + ".name", name); + listAttrMapper(row, key, val); } } - - // Special case for edge scores - private void setEdgeScoreListAttribute(CyRow row, String[] entry, String key) { - - String scoreString; - String scoreType; - + private void setTypedEdgeListAttribute(CyRow row, List<String> entry, String idKey, String descKey) { for (String val : entry) { - final String[] parts = val.split(SUBSEPARATOR); - if (parts == null || parts.length != 2) - continue; + String id = ""; + String desc = ""; - scoreString = parts[1]; - scoreType = parts[0]; - final String colName = key + "." + scoreType; + // Extract description between parens. + int openParen = val.indexOf('('); + if ( openParen >= 0 ) { + int closeParen = val.indexOf(')'); + if ( closeParen > openParen) + desc = val.substring(openParen+1,closeParen); + } - if (row.getTable().getColumn(colName) == null) - row.getTable().createListColumn(colName, Double.class, false); + // Extract ID between quotes. + int firstQuote = val.indexOf('"'); + if ( firstQuote >= 0 ) { + int secondQuote = val.indexOf('"',firstQuote+1); + if ( secondQuote > firstQuote ) { + id = val.substring(firstQuote+1,secondQuote); + } + } - try { - final Double score = Double.parseDouble(scoreString); - row.set(key + "." + scoreType, score); - } catch (Exception e) { - // if (scoreString != null - // && scoreString.trim().equals("") == false) - // row.set(key + "." + scoreType, scoreString); - - continue; + // If we can't parse properly, just shove the whole + // thing in description. + if ( desc.equals("") || id.equals("") ) { + listAttrMapper(row, descKey, val); + } else { + listAttrMapper(row, idKey, id); + listAttrMapper(row, descKey, desc); } } } private void listAttrMapper(CyRow row, String attrName, String value) { - if (row.getTable().getColumn(attrName) == null) - row.getTable().createListColumn(attrName, String.class, false); - List<String> currentAttr = row.getList(attrName, String.class); if (currentAttr == null) { @@ -335,35 +270,6 @@ } } - private String trimPSITerm(String original) { - String miID = null; - - matcher = miPttr.matcher(original); - - if (matcher.find()) { - miID = matcher.group(); - } else { - miID = "-"; - } - - return miID; - } - - private String trimPSIName(String original) { - String miName = null; - - matcher = miNamePttr.matcher(original); - - if (matcher.find()) { - miName = matcher.group(); - miName = miName.substring(1, miName.length() - 1); - } else { - miName = "-"; - } - - return miName; - } - public void cancel() { cancelFlag = true; } @@ -374,4 +280,34 @@ nodeMap = null; } + private void setInteractorType(CyRow row, List<String> aliases) { + // Set type if not protein + if (aliases.contains(CHEBI)) + row.set(INTERACTOR_TYPE, COMPOUND); + } + + private void initColumns(CyNetwork network) { + final CyTable nodeTable = network.getDefaultNodeTable(); + createListColumn(nodeTable,INTERACTOR_TYPE,String.class); + createListColumn(nodeTable,ALIASES,String.class); + createListColumn(nodeTable,TAXONIDS,String.class); + createListColumn(nodeTable,TAXONDBS,String.class); + + final CyTable edgeTable = network.getDefaultEdgeTable(); + createListColumn(edgeTable,INTERACTION_TYPE,String.class); + createListColumn(edgeTable,INTERACTION_TYPE_ID,String.class); + createListColumn(edgeTable,DETECTION_METHOD,String.class); + createListColumn(edgeTable,DETECTION_METHOD_ID,String.class); + createListColumn(edgeTable,SOURCE_DB,String.class); + createListColumn(edgeTable,EDGE_SCORE,String.class); + createListColumn(edgeTable,AUTHORS,String.class); + createListColumn(edgeTable,PUBLICATION_ID,String.class); + createListColumn(edgeTable,PUBLICATION_DB,String.class); + } + + private void createListColumn(CyTable table, String colName, Class<?> type) { + if ( table.getColumn(colName) == null ) + table.createListColumn(colName,String.class,false); + } + } -- You received this message because you are subscribed to the Google Groups "cytoscape-cvs" group. To post to this group, send email to [email protected]. To unsubscribe from this group, send email to [email protected]. For more options, visit this group at http://groups.google.com/group/cytoscape-cvs?hl=en.
