#!/usr/bin/env python2
"""CID from InchiKey flat file

Get ChemspiderIDentifiers (CID) from InchiKeys in a flat file.

Usage: python inchikey2cid.py [options] [source]

Options:
  -h, --help              show this help

Examples:
  inchikey2cid.py filename.tab.txt
"""

__author__ = "Joerg Kurt Wegner (http://miningdrugs.blogspot.com/)"
__copyright__ = "Copyright (c) 2009 Joerg Kurt Wegner"
__license__ = "Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)"

import httplib
import urlparse
import urllib
import urllib2
import base64
import re
import sys
import getopt
import socket

def usage():
    """Print the module docstring, which doubles as the command-line help."""
    print(__doc__)

def getcid_from_inchikey(inchikey):
    """Look up the ChemSpider identifier (CID) for an InChIKey.

    Requests ``http://www.chemspider.com/InChIKey/<inchikey>`` and searches
    the returned page for a ``Chemical-Structure.<digits>.html`` link.  The
    request is attempted up to three times; only network/HTTP failures
    trigger another attempt.

    Args:
        inchikey: the InChIKey string to look up.

    Returns:
        The CID as a string of digits when the page contains a structure
        link, 'NA' when a page was retrieved but contains no such link, or
        'TimeOut' when all three attempts failed.

    Side effects:
        Sets the process-wide default socket timeout to 5 seconds.
    """
    timeout = 5
    socket.setdefaulttimeout(timeout)
    # Compiled once, outside the retry loop.  ``\d+`` (not ``\d*``) so that an
    # empty identifier is never reported as a CID.
    cid_pattern = re.compile(r'Chemical-Structure\.(\d+)\.html')
    url = 'http://www.chemspider.com/InChIKey/' + urllib.quote(inchikey)
    for _attempt in range(3):
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                the_page = response.read()
            finally:
                response.close()
        except (urllib2.URLError, socket.error):
            # URLError covers connection problems and HTTP error statuses
            # (HTTPError is a subclass); socket.error covers failures while
            # reading the body, including socket.timeout, which urllib2 does
            # not wrap.  Just do another round.
            continue
        # A page was retrieved: the answer is final, whether or not it
        # contains a structure link, so do not request it again.
        match = cid_pattern.search(the_page)
        if match is None:
            return 'NA'
        return match.group(1)
    return 'TimeOut'

def _find_inchikey(fields):
    """Return the last field that looks like an InChIKey, or 'NA' if none does."""
    inchikey = 'NA'
    for field in fields:
        # Quick inchikey check: 25 characters with a hyphen at index 14.
        # NOTE(review): the original test ``lentry[14]`` was always true for a
        # 25-character string; a hyphen separator after the 14-character first
        # block is assumed to be the intended check -- confirm.
        if len(field) == 25 and field[14] == '-':
            inchikey = field
    return inchikey


def main(argv):
    """Append a ChemSpider ID column to every line of a tab-separated file.

    Each input line is stripped and split on tabs.  If one of its fields
    passes the quick InChIKey check, the CID returned by
    ``getcid_from_inchikey`` is appended as a new last column; otherwise
    'NA' is appended.  The resulting line is printed to standard output.

    Args:
        argv: command-line arguments without the program name; accepts
            ``-h``/``--help`` and one positional source file name.

    Exits with status 2 on an invalid option, a missing source argument or
    an unreadable source file, and with status 0 after printing the help.
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, _arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()

    if not args:
        # No source file given: show the help instead of failing on args[0].
        usage()
        sys.exit(2)

    try:
        ifile = open(args[0], "r")
    except IOError as e:
        print(e)
        sys.exit(2)

    try:
        for line in ifile:
            # strip() also removes leading/trailing tabs, so empty columns at
            # either end of a line are dropped (kept from the original).
            lsplit = line.strip().split('\t')
            inchikey = _find_inchikey(lsplit)
            if inchikey == 'NA':
                lsplit.append('NA')
            else:
                # slow: one web request (with retries) per InChIKey;
                # the result may also be 'TimeOut' or 'NA'
                lsplit.append(getcid_from_inchikey(inchikey))
            print('\t'.join(lsplit))
    finally:
        ifile.close()
                
if __name__ == "__main__":
    main(sys.argv[1:])
# Code prettifier: google-code-prettify
# Credits: Egon Willighagen for discussing the ChemSpider query option