Add convertKineticsLibraryToTrainingReactions script

connie · connie · commit 70daf53b33ab · 2016-02-28T22:15:06.000-05:00
Specify the kinetics library name below and run the script.
It automatically overwrites the training reactions files it needs to.
 Then you should commit those files.

This script only trains safely. In other words, if a single match
from an RMG family is found, a training reaction is created. Sometimes,
 there are no matches from RMG reaction families, or multiple matches.
This indicates an error that requires manual fixing, and a printout is
 given in the script.
diff --git a/scripts/convertKineticsLibraryToTrainingReactions.ipynb b/scripts/convertKineticsLibraryToTrainingReactions.ipynb
@@ -0,0 +1,394 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Convert Kinetics Library to Training Reactions Script\n",
+    "\n",
+    "Specify the kinetics library name below and run the script.  It automatically overwrites the training reactions files it needs to.  Then you should commit those files.\n",
+    "\n",
+    "This script only trains safely.  In other words, if a single match from an RMG family is found, a training reaction is created.  Sometimes, there are no matches from RMG reaction families, or multiple matches.  This indicates an error that requires manual fixing, and a printout is given in the script."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "libraryName = 'vinylCPD_H'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from rmgpy.data.rmg import RMGDatabase\n",
+    "from rmgpy.chemkin import saveChemkinFile, saveSpeciesDictionary\n",
+    "from rmgpy.rmg.model import Species\n",
+    "from rmgpy import settings\n",
+    "from convertKineticsLibraryToTrainingReactions import addAtomLabelsForReaction"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## load lib_rxn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "database = RMGDatabase()\n",
+    "database.load(settings['database.directory'], kineticsFamilies='all', reactionLibraries = [libraryName], kineticsDepositories='all')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## generate fam_rxn, spec replacement and get reactionDict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "reactionDict = {}\n",
+    "kineticLibrary = database.kinetics.libraries[libraryName]\n",
+    "for index, entry in kineticLibrary.entries.iteritems():\n",
+    "    lib_rxn = entry.item\n",
+    "    lib_rxn.kinetics = entry.data \n",
+    "    lib_rxn.index = entry.index\n",
+    "    lib_rxn.kinetics.comment = entry.label # Assign the entry's label to the comment\n",
+    "    # Let's make RMG try to generate this reaction from the families!\n",
+    "    fam_rxn_list = []\n",
+    "    rxt_mol_mutation_num = 1\n",
+    "    pdt_mol_mutation_num = 1\n",
+    "    for reactant in lib_rxn.reactants:\n",
+    "        rxt_mol_mutation_num *= len(reactant.molecule)\n",
+    "\n",
+    "    for product in lib_rxn.products:\n",
+    "        pdt_mol_mutation_num *= len(product.molecule)\n",
+    "\n",
+    "    for mutation_i in range(rxt_mol_mutation_num):\n",
+    "        rxts_mol = [spc.molecule[mutation_i%(len(spc.molecule))] for spc in lib_rxn.reactants]\n",
+    "        pdts_mol = [spc.molecule[0] for spc in lib_rxn.products]\n",
+    "        fam_rxn_list.extend(database.kinetics.generateReactionsFromFamilies(\n",
+    "                        reactants=rxts_mol, products=pdts_mol))\n",
+    "\n",
+    "\n",
+    "    if len(fam_rxn_list) == 1:\n",
+    "        fam_rxn = fam_rxn_list[0]\n",
+    "\n",
+    "        # danger: the fam_rxn may have switched the reactants with products\n",
+    "        # fam_rxn is survived from def filterReactions\n",
+    "        # so it's matched with lib_rxn only we have to \n",
+    "        # determine the direction\n",
+    "        lib_reactants = [r for r in lib_rxn.reactants]        \n",
+    "        fam_reactants = [r for r in fam_rxn.reactants]\n",
+    "        for lib_reactant in lib_reactants:\n",
+    "            for fam_reactant in fam_reactants:\n",
+    "                if lib_reactant.isIsomorphic(fam_reactant):\n",
+    "                    fam_reactants.remove(fam_reactant)\n",
+    "                    break\n",
+    "\n",
+    "        lib_products = [r for r in lib_rxn.products]        \n",
+    "        fam_products = [r for r in fam_rxn.products]\n",
+    "        for lib_product in lib_products:\n",
+    "            for fam_product in fam_products:\n",
+    "                if lib_product.isIsomorphic(fam_product):\n",
+    "                    fam_products.remove(fam_product)\n",
+    "                    break\n",
+    "\n",
+    "        forward = not (len(fam_reactants) != 0 or len(fam_products) != 0)\n",
+    "        # find the labeled atoms using family and reactants & products from fam_rxn           \n",
+    "        addAtomLabelsForReaction(fam_rxn, database)\n",
+    "        # species replacement so that labeledAtoms is retored\n",
+    "        if forward:\n",
+    "            lib_rxn.reactants = fam_rxn.reactants\n",
+    "            lib_rxn.products = fam_rxn.products\n",
+    "        else:\n",
+    "            lib_rxn.reactants = fam_rxn.products\n",
+    "            lib_rxn.products = fam_rxn.reactants\n",
+    "        if fam_rxn.family in reactionDict:\n",
+    "            reactionDict[fam_rxn.family].append(lib_rxn)\n",
+    "        else:\n",
+    "            reactionDict[fam_rxn.family] = [lib_rxn]\n",
+    "\n",
+    "    elif len(fam_rxn_list) == 0:\n",
+    "        print \"Sad :( There are no matches.  This is a magic reaction or has chemistry that should be made into a new reaction family\"\n",
+    "        print ''\n",
+    "        print lib_rxn\n",
+    "        print ''\n",
+    "        print 'Reactant SMILES:'\n",
+    "        for reactant in lib_rxn.reactants:\n",
+    "            print reactant.molecule[0].toSMILES()\n",
+    "        print 'Product SMILES:'\n",
+    "        for product in lib_rxn.products:\n",
+    "            print product.molecule[0].toSMILES()\n",
+    "        print '==============='\n",
+    "    else:\n",
+    "        print \"There are multiple RMG matches for this reaction. You have to manually create this training reaction\"\n",
+    "        print ''\n",
+    "        print lib_rxn\n",
+    "        print ''\n",
+    "        print 'Reactant SMILES:'\n",
+    "        for reactant in lib_rxn.reactants:\n",
+    "            print reactant.molecule[0].toSMILES()\n",
+    "        print 'Product SMILES'\n",
+    "        for product in lib_rxn.products:\n",
+    "            print product.molecule[0].toSMILES()\n",
+    "        print ''\n",
+    "        print 'The following families were matched:'\n",
+    "        for rxn in fam_rxn_list:\n",
+    "            print rxn.family\n",
+    "        print '==============='\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "for familyName in reactionDict:\n",
+    "    print 'Adding training reactions for family: ' + familyName\n",
+    "    kineticFamily = database.kinetics.families[familyName]\n",
+    "    trainingDatabase = None\n",
+    "    for depository in kineticFamily.depositories:\n",
+    "            if depository.label.endswith('training'):\n",
+    "                trainingDatabase = depository\n",
+    "                break\n",
+    "    reactions = reactionDict[familyName]\n",
+    "    print 'reactions.py previously has {} rxns. Now adding {} new rxn(s).'.format(len(trainingDatabase.entries.values()), len(reactions))\n",
+    "    print '================='\n",
+    "    kineticFamily.saveTrainingReactions(reactions, shortDesc='Training reaction from kinetics library: {0}'.format(libraryName))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# How saveTrainingReaction works\n",
+    "\n",
+    "This part of the script is commented and should not be run.  It serves only to demonstrate how the code for saving the training reactions works.\n",
+    "\n",
+    "## get speciesDict\n",
+    "\n",
+    "### load existing species as an intial speciesDict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# import os\n",
+    "# from rmgpy.data.base import Database\n",
+    "\n",
+    "# training_path = os.path.join(settings['database.directory'], \\\n",
+    "#                              'kinetics', 'families', 'R_Addition_MultipleBond', 'training')\n",
+    "\n",
+    "# dictionary_file = os.path.join(training_path, 'dictionary.txt')\n",
+    "\n",
+    "# # Load the existing set of the species of the training reactions\n",
+    "# speciesDict = Database().getSpecies(dictionary_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### for one family check uniqueness of each species in the lib_rxns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# familyName = 'R_Addition_MultipleBond'\n",
+    "# print 'Adding training reactions for family: ' + familyName\n",
+    "# kineticFamily = database.kinetics.families[familyName]\n",
+    "# reactions = reactionDict[familyName]\n",
+    "\n",
+    "# for rxn in reactions:\n",
+    "#     for spec in (rxn.reactants + rxn.products):\n",
+    "#         for ex_spec_label in speciesDict:\n",
+    "#             ex_spec = speciesDict[ex_spec_label]\n",
+    "#             if ex_spec.molecule[0].getFormula() != spec.molecule[0].getFormula():\n",
+    "#                 continue\n",
+    "#             else:\n",
+    "#                 spec_labeledAtoms = spec.molecule[0].getLabeledAtoms()\n",
+    "#                 ex_spec_labeledAtoms = ex_spec.molecule[0].getLabeledAtoms()\n",
+    "#                 initialMap = {}\n",
+    "#                 try:\n",
+    "#                     for atomLabel in spec_labeledAtoms:\n",
+    "#                         initialMap[spec_labeledAtoms[atomLabel]] = ex_spec_labeledAtoms[atomLabel]\n",
+    "#                 except KeyError:\n",
+    "#                     # atom labels did not match, therefore not a match\n",
+    "#                     continue\n",
+    "#                 if spec.molecule[0].isIsomorphic(ex_spec.molecule[0],initialMap):\n",
+    "#                     spec.label = ex_spec.label\n",
+    "#                     break\n",
+    "#         else:# no isomorphic existing species found\n",
+    "#             spec_formula = spec.molecule[0].getFormula()\n",
+    "#             if spec_formula not in speciesDict:\n",
+    "#                 spec.label = spec_formula\n",
+    "#             else:\n",
+    "#                 index = 2\n",
+    "#                 while (spec_formula + '-{}'.format(index)) in speciesDict:\n",
+    "#                     index += 1\n",
+    "#                 spec.label = spec_formula + '-{}'.format(index)\n",
+    "#             speciesDict[spec.label] = spec"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## save to files\n",
+    "\n",
+    "Save reactionDict to reactions.py and speciesDict to dictionary.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# # try to append \n",
+    "# training_file = open(os.path.join(settings['database.directory'], 'kinetics', 'families', \\\n",
+    "#             kineticFamily.label, 'training', 'reactions_test.py'), 'a')\n",
+    "\n",
+    "# training_file.write(\"\\n\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# # find the largest reaction index\n",
+    "# for depository in kineticFamily.depositories:\n",
+    "#     if depository.label.endswith('training'):\n",
+    "#         break\n",
+    "# else:\n",
+    "#     logging.info('Could not find training depository in family {0}.'.format(kineticFamily.label))\n",
+    "#     logging.info('Starting a new one')\n",
+    "#     depository = KineticsDepository()\n",
+    "#     kineticFamily.depositories.append(depository)\n",
+    "\n",
+    "# trainingDatabase = depository\n",
+    "# indices = [entry.index for entry in trainingDatabase.entries.values()]\n",
+    "# if indices:\n",
+    "#     maxIndex = max(indices)\n",
+    "# else:\n",
+    "#     maxIndex = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# # save reactions.py\n",
+    "# from rmgpy.data.base import Entry\n",
+    "# for i, reaction in enumerate(reactions):    \n",
+    "#     entry = Entry(\n",
+    "#         index = maxIndex+i+1,\n",
+    "#         label = str(reaction),\n",
+    "#         item = reaction,\n",
+    "#         data = reaction.kinetics,\n",
+    "#         reference = None,\n",
+    "#         referenceType = '',\n",
+    "#         shortDesc = unicode(''),\n",
+    "#         longDesc = unicode(''),\n",
+    "#         rank = 3,\n",
+    "#         )\n",
+    "#     print reaction\n",
+    "#     kineticFamily.saveEntry(training_file, entry)\n",
+    "\n",
+    "# training_file.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# # save dictionary.txt\n",
+    "# directory_test_file = os.path.join(training_path, 'directory_test.txt')\n",
+    "# with open(directory_test_file, 'w') as f:\n",
+    "#     for label in speciesDict.keys():\n",
+    "#         f.write(speciesDict[label].molecule[0].toAdjacencyList(label=label, removeH=False))\n",
+    "#         f.write('\\n')\n",
+    "# f.close()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}