From 9fd81a582fd0d2772fa135ab8e3633acf049b60f Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Thu, 21 Jul 2016 21:58:25 -0400 Subject: [PATCH 01/16] Add the evaluateKinetics.py script to the testing folder --- testing/evaluateKinetics.py | 399 ++++++++++++++++++++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 testing/evaluateKinetics.py diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py new file mode 100644 index 0000000000..d950ec3f7d --- /dev/null +++ b/testing/evaluateKinetics.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +This script is meant to use statistical tests to evaluate the rate rules of RMG. +""" + +import os.path +import math +import numpy +import matplotlib +matplotlib.rc('mathtext', fontset='stixsans', default='regular') +import matplotlib.pyplot as plt +import pylab +import re +import copy +import csv + +from rmgpy.thermo import * +from rmgpy.kinetics import * +from rmgpy.data.reference import * +from rmgpy.data.base import Entry +from rmgpy.data.thermo import ThermoDatabase +from rmgpy.data.kinetics import saveEntry +from rmgpy.molecule import Molecule +from rmgpy.species import Species +from rmgpy.reaction import Reaction +from rmgpy.data.rmg import RMGDatabase +from rmgpy.data.kinetics.common import UndeterminableKineticsError + + +def getKineticsDepository(family, depositoryLabel, missingGroups): + + depository = None + for tempDepository in family.depositories: + if re.search(depositoryLabel, tempDepository.label): + depository=tempDepository + + family.fillKineticsRulesByAveragingUp() + + exactKinetics={} + approxKinetics={} + + for key, entry in depository.entries.iteritems(): + try: + reaction=entry.item + template=family.getReactionTemplate(reaction) + exactKinetics[key]=entry.data + approxKinetics[key]=family.rules.estimateKinetics(template)[0] + + + except UndeterminableKineticsError: + missingGroups.append([family.label, key , 'No Template found']) + + except Exception as inst: + missingGroups.append([family.label, key, 'Other error, needs further investigation']) + + return exactKinetics, approxKinetics, missingGroups + +def getKineticsLeaveOneOut(family, missingGroups): + """ + Performs the leave one out test on a family. It returns a dictionary of + the original exact nodes and a dictionary of the new averaged nodes. + The returned dictionary entries will be of a KineticModel class + """ + + entryKeys=family.rules.entries.keys() + + exactKinetics={} + approxKinetics={} + index=0 + for entryKey in entryKeys: + index+=1 +# templateKeys=re.split(';', entryKey) +# print entryKey, templateKeys, index + try: +# print entryKey + template=family.getReactionTemplate(family.rules.entries[entryKey][0].item) + + print entryKey, [templateEntry.label for templateEntry in template], index + exactKinetics[entryKey], exactKineticsEntry=family.rules.estimateKinetics(template) + + familyCopy=copy.deepcopy(family) + familyCopy.rules.entries.pop(entryKey) + familyCopy.fillKineticsRulesByAveragingUp() + + approxKinetics[entryKey], approxKineticsEntry=familyCopy.rules.estimateKinetics(template) + + except UndeterminableKineticsError: + missingGroups.append([family.label, re.split(';', entryKey), 'No Template found']) + + except Exception as inst: + missingGroups.append([family.label, re.split(';', entryKey), 'Other error, needs further investigation']) + + except AttributeError: +# if family.label in logicErrorNodes.keys(): +# logicErrorNodes[family.label].append(entryKey) +# else: +# logicErrorNodes[family.label]=[entryKey] + pass #maybe add more later + + return exactKinetics, approxKinetics, missingGroups + +#returns the average temperature for the range given by the kinetic model +def getAverageTemp(kineticModel): +# try: +# return (kineticModel.Tmin.value + kineticModel.Tmax.value)/2 +# except AttributeError: + return 1000 + +#calculates the parity values for each +def calculateParity(exactKineticModel, approxKineticModel, T): + exact = exactKineticModel.getRateCoefficient(T) + approx = approxKineticModel.getRateCoefficient(T) + + return float(approx)/float(exact) + + +def analyzeForParity(exactKinetics, approxKinetics, T=None, cutoff=0): + """ + Creates a parity plot from the exactKinetics and approxKinetics (dictionarys with + kineticModels are entries). Uses the median temperature of the exactKinetics to + give the best comparison. Returns the parityData in a dictionary with + {key: [exactCoefficient(T), approxCoefficient(T)} + """ + parityData={} + for key in approxKinetics: + if T is None: + T=getAverageTemp(exactKinetics[key]) + print exactKinetics[key] + exact=exactKinetics[key].getRateCoefficient(T) + approx=approxKinetics[key].getRateCoefficient(T) + dataPoint=[exact, approx] + if cutoff!=0 and math.log10((float(exact)/float(approx)))**2 > cutoff**2: + continue + parityData[key]=dataPoint + + return parityData + + +def calculateQ(parityData): + """ + Calculates the predicted root mean square error + """ + Q=0 + for key, value in parityData.iteritems(): + Q+=(math.log10(value[0]/value[1]))**2 + return (Q/len(parityData))**0.5 + +def createParityPlot(parityData): + + #unpack the data + keyList=parityData.keys() + xAxis=[] + yAxis=[] + for key in keyList: + xAxis.append(parityData[key][0]) + yAxis.append(parityData[key][1]) + + plt.loglog(xAxis,yAxis, 'ks') + + minimum=min(min(xAxis), min(yAxis)) + maximum=max(max(xAxis), max(yAxis)) + plt.loglog([minimum/10,maximum*10], [minimum/10,maximum*10], 'k') + plt.loglog([minimum*10,maximum*10], [minimum/10,maximum/10], 'k--') + plt.loglog([minimum/10,maximum/10],[minimum*10,maximum*10], 'k--') + plt.xlabel('Actual rate coefficient (cm^3/mol-s)') + plt.ylabel('Estimated rate coefficient (cm^3/mol-s)') + plt.axis([minimum/10, maximum*10, minimum/10, maximum*10]) + +def countNodes(family): + countList=[family.label] + + #get top nodes + forwardTemplate = family.groups.top[:] + + temporary = [] + symmetricTree = False + for entry in forwardTemplate: + if entry not in temporary: + temporary.append(entry) + else: + # duplicate node found at top of tree + # eg. R_recombination: ['Y_rad', 'Y_rad'] + assert len(forwardTemplate)==2 , 'Can currently only do symmetric trees with nothing else in them' + symmetricTree = True + forwardTemplate = temporary + + for group in forwardTemplate: + checkList=[group] + childrenList=[group] + while len(checkList)>0: + childrenList.extend(checkList[0].children) + checkList.extend(checkList[0].children) + del checkList[0] + + countList.append(len(childrenList)) + return countList + + +def getKineticsFromRules(family, entryKey): + + + family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) + family.fillKineticsRulesByAveragingUp() + + entryKeys=re.split(';', entryKey) + + template=[] + for key in entryKeys: + template.append(family.groups.entries[key]) + + return family.rules.estimateKinetics(template) + + +########################################################################################################### +# Functions for the full Database + +def countNodesAll(FullDatabase, trialDir): + for family in FullDatabase.kinetics.families.values(): + family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) + + allFamilyNames=FullDatabase.kinetics.families.keys() + + familyCount={} + + for familyName in allFamilyNames: + family=FullDatabase.kinetics.families[familyName] + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' nodes)' + familyCount[familyName]=countNodes(family) + + with open(os.path.join(trialDir, 'NodeCount.csv'), 'wb') as csvfile: + csvwriter=csv.writer(csvfile) + for key, value in familyCount.iteritems(): + csvwriter.writerow(value) + + +def NISTExact(FullDatabase, trialDir): + + trialDir=os.path.join(trialDir, 'NISTExact') + if not os.path.exists(trialDir): + os.makedirs(trialDir) + + for family in FullDatabase.kinetics.families.values(): + family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) + + + allFamilyNames=FullDatabase.kinetics.families.keys() +# familyName='Disproportionation' +# allFamilyNames=[familyName] + + missingGroups=[] + QDict={} + familiesWithErrors=[] + for familyName in allFamilyNames: + family=FullDatabase.kinetics.families[familyName] + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' nodes)' + if len(family.rules.entries) < 2: + print ' Skipping', familyName, ': only has one node...' + else: + ##getKineticsDepository + exactKinetics, approxKinetics, missingGroups=getKineticsDepository(family, 'NIST', missingGroups) + + #prune for exact matches only + keysToRemove=[] + for key, kinetics in approxKinetics.iteritems(): + if not re.search('Exact', kinetics.comment): + keysToRemove.append(key) + + for key in keysToRemove: + del approxKinetics[key] + + try: + parityData=analyzeForParity(exactKinetics, approxKinetics, None, 8) + except KeyError: + familiesWithErrors.append(family.label) + continue + if len(parityData)<2: + print ' Skipping', familyName, ': only one node was calculated...' + continue + QDict[familyName]=calculateQ(parityData) + createParityPlot(parityData) + plt.title(familyName) + plt.savefig(os.path.join(trialDir, familyName +'.png')) + plt.clf() + + if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))): + os.makedirs(os.path.join(trialDir, 'ParityData')) + + with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as csvfile: + csvwriter=csv.writer(csvfile) + for key, value in parityData.iteritems(): + csvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment]) + + with open(os.path.join(trialDir, 'QDict_LOO.csv'), 'wb') as csvfile: + csvwriter=csv.writer(csvfile) + for key, value in QDict.iteritems(): + csvwriter.writerow([key, value]) + + with open(os.path.join(trialDir, 'missingNodes.csv'), 'wb') as csvfile: + csvwriter=csv.writer(csvfile) + for missingNode in missingGroups: + csvwriter.writerow(missingNode) + + print 'These families had errors:', familiesWithErrors + + + +def leaveOneOut(FullDatabase, trialDir): + """ + Performs leave one out analysis on all the kinetics families. + """ + + trialDir=os.path.join(trialDir, 'LeaveOneOut') + if not os.path.exists(trialDir): + os.makedirs(trialDir) + + for family in FullDatabase.kinetics.families.values(): + family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) + +# familyName='intra_substitutionCS_isomerization' +# allFamilyNames=[familyName] + allFamilyNames=FullDatabase.kinetics.families.keys() + + missingGroups=[] + QDict={} + familiesWithErrors=[] + for familyName in allFamilyNames: + family=FullDatabase.kinetics.families[familyName] + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' nodes)' + if len(family.rules.entries) < 2: + print ' Skipping', familyName, ': only has one node...' + else: + ##getKineticsLeaveOneOut + exactKinetics, approxKinetics, missingGroups=getKineticsLeaveOneOut(family, missingGroups) + try: + parityData=analyzeForParity(exactKinetics, approxKinetics, None, 8) + except KeyError: + familiesWithErrors.append(family.label) + continue + if len(parityData)<2: + print ' Skipping', familyName, ': only one node was calculated...' + continue + QDict[familyName]=calculateQ(parityData) + createParityPlot(parityData) + plt.title(familyName) + plt.savefig(os.path.join(trialDir, familyName +'.png')) + plt.clf() + + if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))): + os.makedirs(os.path.join(trialDir, 'ParityData')) + + with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as csvfile: + csvwriter=csv.writer(csvfile) + for key, value in parityData.iteritems(): + csvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment]) + + with open(os.path.join(trialDir, 'QDict_LOO.csv'), 'wb') as csvfile: + csvwriter=csv.writer(csvfile) + for key, value in QDict.iteritems(): + csvwriter.writerow([key, value]) + + with open(os.path.join(trialDir, 'missingNodes.csv'), 'wb') as csvfile: + csvwriter=csv.writer(csvfile) + for missingNode in missingGroups: + csvwriter.writerow(missingNode) + + print 'These families had errors:', familiesWithErrors + return + + +if __name__ == '__main__': + from rmgpy import settings + print 'Loading the RMG database...' + FullDatabase=RMGDatabase() + FullDatabase.load(settings['database.directory'], + kineticsFamilies=['Disproportionation'], + kineticsDepositories='all', + thermoLibraries=[], + reactionLibraries=[], + ) + + trialDir = os.path.join(settings['database.directory'],'..','testing','eval') + if not os.path.exists(trialDir): + os.makedirs(trialDir) + + family=FullDatabase.kinetics.families['Disproportionation'] + print family.depositories + print FullDatabase.thermo.libraries + entryKey='Y_1centerbirad;O_Cdrad' + + retrievedKinetics, retrievedEntry = getKineticsFromRules(family, entryKey) + + print retrievedKinetics + + NISTExact(FullDatabase, trialDir) + countNodesAll(FullDatabase, trialDir) + leaveOneOut(FullDatabase, trialDir) + \ No newline at end of file From 7adbe256b788ebfcd26dbae95b324636bf8632bc Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Thu, 21 Jul 2016 21:59:10 -0400 Subject: [PATCH 02/16] Gitignore kinetics evaluation folder --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 76ecb493ad..a94eafd8a8 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,7 @@ # C-Compiled scripts *.pyc -*.ipynb_checkpoints \ No newline at end of file +*.ipynb_checkpoints + +# evaluate kinetics folder stuff +testing/eval/* \ No newline at end of file From 878c4ef0cd96a56f694461bc80ca0e57e0f2c29d Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Thu, 21 Jul 2016 22:05:09 -0400 Subject: [PATCH 03/16] Get rid of try/except blocks top see what is goin on --- testing/evaluateKinetics.py | 108 ++++++++---------------------------- 1 file changed, 22 insertions(+), 86 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index d950ec3f7d..243bd0295f 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -42,20 +42,12 @@ def getKineticsDepository(family, depositoryLabel, missingGroups): approxKinetics={} for key, entry in depository.entries.iteritems(): - try: - reaction=entry.item - template=family.getReactionTemplate(reaction) - exactKinetics[key]=entry.data - approxKinetics[key]=family.rules.estimateKinetics(template)[0] - - - except UndeterminableKineticsError: - missingGroups.append([family.label, key , 'No Template found']) - - except Exception as inst: - missingGroups.append([family.label, key, 'Other error, needs further investigation']) + reaction=entry.item + template=family.getReactionTemplate(reaction) + exactKinetics[key]=entry.data + approxKinetics[key]=family.rules.estimateKinetics(template)[0] - return exactKinetics, approxKinetics, missingGroups + return exactKinetics, approxKinetics def getKineticsLeaveOneOut(family, missingGroups): """ @@ -71,42 +63,19 @@ def getKineticsLeaveOneOut(family, missingGroups): index=0 for entryKey in entryKeys: index+=1 -# templateKeys=re.split(';', entryKey) -# print entryKey, templateKeys, index - try: -# print entryKey - template=family.getReactionTemplate(family.rules.entries[entryKey][0].item) - - print entryKey, [templateEntry.label for templateEntry in template], index - exactKinetics[entryKey], exactKineticsEntry=family.rules.estimateKinetics(template) - - familyCopy=copy.deepcopy(family) - familyCopy.rules.entries.pop(entryKey) - familyCopy.fillKineticsRulesByAveragingUp() - - approxKinetics[entryKey], approxKineticsEntry=familyCopy.rules.estimateKinetics(template) - - except UndeterminableKineticsError: - missingGroups.append([family.label, re.split(';', entryKey), 'No Template found']) + template=family.getReactionTemplate(family.rules.entries[entryKey][0].item) + + print entryKey, [templateEntry.label for templateEntry in template], index + exactKinetics[entryKey], exactKineticsEntry=family.rules.estimateKinetics(template) - except Exception as inst: - missingGroups.append([family.label, re.split(';', entryKey), 'Other error, needs further investigation']) + familyCopy=copy.deepcopy(family) + familyCopy.rules.entries.pop(entryKey) + familyCopy.fillKineticsRulesByAveragingUp() - except AttributeError: -# if family.label in logicErrorNodes.keys(): -# logicErrorNodes[family.label].append(entryKey) -# else: -# logicErrorNodes[family.label]=[entryKey] - pass #maybe add more later - - return exactKinetics, approxKinetics, missingGroups - -#returns the average temperature for the range given by the kinetic model -def getAverageTemp(kineticModel): -# try: -# return (kineticModel.Tmin.value + kineticModel.Tmax.value)/2 -# except AttributeError: - return 1000 + approxKinetics[entryKey], approxKineticsEntry=familyCopy.rules.estimateKinetics(template) + + return exactKinetics, approxKinetics + #calculates the parity values for each def calculateParity(exactKineticModel, approxKineticModel, T): @@ -116,7 +85,7 @@ def calculateParity(exactKineticModel, approxKineticModel, T): return float(approx)/float(exact) -def analyzeForParity(exactKinetics, approxKinetics, T=None, cutoff=0): +def analyzeForParity(exactKinetics, approxKinetics, T=1000, cutoff=0): """ Creates a parity plot from the exactKinetics and approxKinetics (dictionarys with kineticModels are entries). Uses the median temperature of the exactKinetics to @@ -125,9 +94,7 @@ def analyzeForParity(exactKinetics, approxKinetics, T=None, cutoff=0): """ parityData={} for key in approxKinetics: - if T is None: - T=getAverageTemp(exactKinetics[key]) - print exactKinetics[key] + exact=exactKinetics[key].getRateCoefficient(T) approx=approxKinetics[key].getRateCoefficient(T) dataPoint=[exact, approx] @@ -198,21 +165,6 @@ def countNodes(family): return countList -def getKineticsFromRules(family, entryKey): - - - family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) - family.fillKineticsRulesByAveragingUp() - - entryKeys=re.split(';', entryKey) - - template=[] - for key in entryKeys: - template.append(family.groups.entries[key]) - - return family.rules.estimateKinetics(template) - - ########################################################################################################### # Functions for the full Database @@ -270,11 +222,8 @@ def NISTExact(FullDatabase, trialDir): for key in keysToRemove: del approxKinetics[key] - try: - parityData=analyzeForParity(exactKinetics, approxKinetics, None, 8) - except KeyError: - familiesWithErrors.append(family.label) - continue + parityData=analyzeForParity(exactKinetics, approxKinetics, None, 8) + if len(parityData)<2: print ' Skipping', familyName, ': only one node was calculated...' continue @@ -331,13 +280,9 @@ def leaveOneOut(FullDatabase, trialDir): if len(family.rules.entries) < 2: print ' Skipping', familyName, ': only has one node...' else: - ##getKineticsLeaveOneOut exactKinetics, approxKinetics, missingGroups=getKineticsLeaveOneOut(family, missingGroups) - try: - parityData=analyzeForParity(exactKinetics, approxKinetics, None, 8) - except KeyError: - familiesWithErrors.append(family.label) - continue + parityData=analyzeForParity(exactKinetics, approxKinetics, None, 8) + if len(parityData)<2: print ' Skipping', familyName, ': only one node was calculated...' continue @@ -383,15 +328,6 @@ def leaveOneOut(FullDatabase, trialDir): trialDir = os.path.join(settings['database.directory'],'..','testing','eval') if not os.path.exists(trialDir): os.makedirs(trialDir) - - family=FullDatabase.kinetics.families['Disproportionation'] - print family.depositories - print FullDatabase.thermo.libraries - entryKey='Y_1centerbirad;O_Cdrad' - - retrievedKinetics, retrievedEntry = getKineticsFromRules(family, entryKey) - - print retrievedKinetics NISTExact(FullDatabase, trialDir) countNodesAll(FullDatabase, trialDir) From 2984803c6f57c2171934d4f34f0f920b9d791a26 Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Thu, 21 Jul 2016 22:33:04 -0400 Subject: [PATCH 04/16] Deleted missingGroups attribute. Updated such that this works on current rmg-py --- testing/evaluateKinetics.py | 63 ++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index 243bd0295f..9afb44e8a2 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -29,7 +29,7 @@ from rmgpy.data.kinetics.common import UndeterminableKineticsError -def getKineticsDepository(family, depositoryLabel, missingGroups): +def getKineticsDepository(family, depositoryLabel): depository = None for tempDepository in family.depositories: @@ -42,14 +42,18 @@ def getKineticsDepository(family, depositoryLabel, missingGroups): approxKinetics={} for key, entry in depository.entries.iteritems(): - reaction=entry.item - template=family.getReactionTemplate(reaction) - exactKinetics[key]=entry.data - approxKinetics[key]=family.rules.estimateKinetics(template)[0] + try: + reaction=entry.item + template=family.getReactionTemplate(reaction) + exactKinetics[key]=entry.data + approxKinetics[key]=family.rules.estimateKinetics(template)[0] + except Exception as e: + print entry.item + print e return exactKinetics, approxKinetics -def getKineticsLeaveOneOut(family, missingGroups): +def getKineticsLeaveOneOut(family): """ Performs the leave one out test on a family. It returns a dictionary of the original exact nodes and a dictionary of the new averaged nodes. @@ -63,9 +67,9 @@ def getKineticsLeaveOneOut(family, missingGroups): index=0 for entryKey in entryKeys: index+=1 - template=family.getReactionTemplate(family.rules.entries[entryKey][0].item) - - print entryKey, [templateEntry.label for templateEntry in template], index + + nodes = entryKey.split(';') + template = [family.groups.entries[node] for node in nodes] exactKinetics[entryKey], exactKineticsEntry=family.rules.estimateKinetics(template) familyCopy=copy.deepcopy(family) @@ -77,15 +81,18 @@ def getKineticsLeaveOneOut(family, missingGroups): return exactKinetics, approxKinetics -#calculates the parity values for each + def calculateParity(exactKineticModel, approxKineticModel, T): + ''' + Calculates the parity values between two sets of kinetics evaluated at temperature T + ''' exact = exactKineticModel.getRateCoefficient(T) approx = approxKineticModel.getRateCoefficient(T) return float(approx)/float(exact) -def analyzeForParity(exactKinetics, approxKinetics, T=1000, cutoff=0): +def analyzeForParity(exactKinetics, approxKinetics, T=1000.0, cutoff=0.0): """ Creates a parity plot from the exactKinetics and approxKinetics (dictionarys with kineticModels are entries). Uses the median temperature of the exactKinetics to @@ -109,7 +116,7 @@ def calculateQ(parityData): """ Calculates the predicted root mean square error """ - Q=0 + Q=0.0 for key, value in parityData.iteritems(): Q+=(math.log10(value[0]/value[1]))**2 return (Q/len(parityData))**0.5 @@ -201,17 +208,15 @@ def NISTExact(FullDatabase, trialDir): # familyName='Disproportionation' # allFamilyNames=[familyName] - missingGroups=[] QDict={} - familiesWithErrors=[] + for familyName in allFamilyNames: family=FullDatabase.kinetics.families[familyName] print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' nodes)' if len(family.rules.entries) < 2: print ' Skipping', familyName, ': only has one node...' else: - ##getKineticsDepository - exactKinetics, approxKinetics, missingGroups=getKineticsDepository(family, 'NIST', missingGroups) + exactKinetics, approxKinetics =getKineticsDepository(family, 'NIST') #prune for exact matches only keysToRemove=[] @@ -222,7 +227,7 @@ def NISTExact(FullDatabase, trialDir): for key in keysToRemove: del approxKinetics[key] - parityData=analyzeForParity(exactKinetics, approxKinetics, None, 8) + parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) if len(parityData)<2: print ' Skipping', familyName, ': only one node was calculated...' @@ -246,12 +251,7 @@ def NISTExact(FullDatabase, trialDir): for key, value in QDict.iteritems(): csvwriter.writerow([key, value]) - with open(os.path.join(trialDir, 'missingNodes.csv'), 'wb') as csvfile: - csvwriter=csv.writer(csvfile) - for missingNode in missingGroups: - csvwriter.writerow(missingNode) - print 'These families had errors:', familiesWithErrors @@ -271,17 +271,16 @@ def leaveOneOut(FullDatabase, trialDir): # allFamilyNames=[familyName] allFamilyNames=FullDatabase.kinetics.families.keys() - missingGroups=[] QDict={} - familiesWithErrors=[] + for familyName in allFamilyNames: family=FullDatabase.kinetics.families[familyName] print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' nodes)' if len(family.rules.entries) < 2: print ' Skipping', familyName, ': only has one node...' else: - exactKinetics, approxKinetics, missingGroups=getKineticsLeaveOneOut(family, missingGroups) - parityData=analyzeForParity(exactKinetics, approxKinetics, None, 8) + exactKinetics, approxKinetics =getKineticsLeaveOneOut(family) + parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) if len(parityData)<2: print ' Skipping', familyName, ': only one node was calculated...' @@ -304,13 +303,7 @@ def leaveOneOut(FullDatabase, trialDir): csvwriter=csv.writer(csvfile) for key, value in QDict.iteritems(): csvwriter.writerow([key, value]) - - with open(os.path.join(trialDir, 'missingNodes.csv'), 'wb') as csvfile: - csvwriter=csv.writer(csvfile) - for missingNode in missingGroups: - csvwriter.writerow(missingNode) - print 'These families had errors:', familiesWithErrors return @@ -328,8 +321,12 @@ def leaveOneOut(FullDatabase, trialDir): trialDir = os.path.join(settings['database.directory'],'..','testing','eval') if not os.path.exists(trialDir): os.makedirs(trialDir) - + print 'Evaluating the NIST Kinetics against the RMG estimates...' NISTExact(FullDatabase, trialDir) + + print 'Counting the rate rules in the families...' countNodesAll(FullDatabase, trialDir) + + print 'Performing the leave on out test on the kinetics families...' leaveOneOut(FullDatabase, trialDir) \ No newline at end of file From c47a198ed67370109bb6a1185fad8914a4503236 Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Fri, 22 Jul 2016 17:08:11 -0400 Subject: [PATCH 05/16] Fix compareNIST function Raise errors if depository isn't found. Remove averaging and adding of training reactions in the family, assumes you have done it outside the function. Compares reverse kinetics if the NIST reaction is in the reverse direction from the family template --- testing/evaluateKinetics.py | 73 ++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index 9afb44e8a2..a810d9cc31 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -9,9 +9,8 @@ import math import numpy import matplotlib -matplotlib.rc('mathtext', fontset='stixsans', default='regular') -import matplotlib.pyplot as plt -import pylab +matplotlib.use('Agg') +from matplotlib import pyplot as plt import re import copy import csv @@ -28,15 +27,22 @@ from rmgpy.data.rmg import RMGDatabase from rmgpy.data.kinetics.common import UndeterminableKineticsError - -def getKineticsDepository(family, depositoryLabel): +def getKineticsDepository(FullDatabase, family, depositoryLabel): + """ + Retrieve the NIST exact and approximated kinetics for those same reactions from RMG. + Note: does NOT average up the database or create any rate rules from training data. + If that is desired it must be done prior to entering this function. + """ depository = None for tempDepository in family.depositories: if re.search(depositoryLabel, tempDepository.label): depository=tempDepository + break + else: + print 'Depository {} not found in {} family.'.format(depositoryLabel, family.label) + return - family.fillKineticsRulesByAveragingUp() exactKinetics={} approxKinetics={} @@ -47,9 +53,25 @@ def getKineticsDepository(family, depositoryLabel): template=family.getReactionTemplate(reaction) exactKinetics[key]=entry.data approxKinetics[key]=family.rules.estimateKinetics(template)[0] - except Exception as e: - print entry.item - print e + except UndeterminableKineticsError: + # See if the reaction was written in the reverse direction + reaction = Reaction(reactants = copy.deepcopy(entry.item.products), + products = copy.deepcopy(entry.item.reactants), + kinetics = copy.deepcopy(entry.data) + ) + + template=family.getReactionTemplate(reaction) + + # Getting thermo data erases the atomLabels, so do this after finding the template + # But we need it for setting the reverse kinetics + for spec in reaction.reactants + reaction.products: + spec.getThermoData() + + reverseKinetics = reaction.generateReverseRateCoefficient() + reaction.kinetics = reverseKinetics + + exactKinetics[key]=reaction.kinetics + approxKinetics[key]=family.rules.estimateKinetics(template)[0] return exactKinetics, approxKinetics @@ -59,17 +81,11 @@ def getKineticsLeaveOneOut(family): the original exact nodes and a dictionary of the new averaged nodes. The returned dictionary entries will be of a KineticModel class """ - - entryKeys=family.rules.entries.keys() - exactKinetics={} approxKinetics={} - index=0 - for entryKey in entryKeys: - index+=1 - - nodes = entryKey.split(';') - template = [family.groups.entries[node] for node in nodes] + + for entryKey in family.rules.entries.keys(): + template = family.retrieveTemplate(entryKey) exactKinetics[entryKey], exactKineticsEntry=family.rules.estimateKinetics(template) familyCopy=copy.deepcopy(family) @@ -123,7 +139,6 @@ def calculateQ(parityData): def createParityPlot(parityData): - #unpack the data keyList=parityData.keys() xAxis=[] yAxis=[] @@ -194,19 +209,19 @@ def countNodesAll(FullDatabase, trialDir): csvwriter.writerow(value) -def NISTExact(FullDatabase, trialDir): - - trialDir=os.path.join(trialDir, 'NISTExact') +def compareNIST(FullDatabase, trialDir): + """ + Compare NIST reaction kinetics with estimates from RMG. Creates parity plot and + calculates the predicted root mean square error from the families. + Note: does NOT average up the database or create any rate rules from training data. + If that is desired it must be done prior to entering this function. + """ + trialDir=os.path.join(trialDir, 'compareNIST') if not os.path.exists(trialDir): os.makedirs(trialDir) - for family in FullDatabase.kinetics.families.values(): - family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) - allFamilyNames=FullDatabase.kinetics.families.keys() -# familyName='Disproportionation' -# allFamilyNames=[familyName] QDict={} @@ -216,7 +231,7 @@ def NISTExact(FullDatabase, trialDir): if len(family.rules.entries) < 2: print ' Skipping', familyName, ': only has one node...' else: - exactKinetics, approxKinetics =getKineticsDepository(family, 'NIST') + exactKinetics, approxKinetics = getKineticsDepository(FullDatabase, family, 'NIST') #prune for exact matches only keysToRemove=[] @@ -322,7 +337,7 @@ def leaveOneOut(FullDatabase, trialDir): if not os.path.exists(trialDir): os.makedirs(trialDir) print 'Evaluating the NIST Kinetics against the RMG estimates...' - NISTExact(FullDatabase, trialDir) + compareNIST(FullDatabase, trialDir) print 'Counting the rate rules in the families...' countNodesAll(FullDatabase, trialDir) From 498113d53e5416321a82ca7537298fef136dae9d Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Fri, 22 Jul 2016 17:18:09 -0400 Subject: [PATCH 06/16] Fix the obtainKineticsFamilyStatistics retrieves data involving the family Now shows the top node names, and also the number of rules csv file has headers now. Does not average the families or add training data. Assumes any type of manipulation has been done outside the folder. Also, don't load any kineticsLibraries or thermoLibraries by default except the primaryThermoLibrary. But do load all the depositories --- testing/evaluateKinetics.py | 61 ++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index a810d9cc31..ed04fa9ab1 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -158,23 +158,28 @@ def createParityPlot(parityData): plt.axis([minimum/10, maximum*10, minimum/10, maximum*10]) def countNodes(family): + """ + Count the number of groups under each tree in the Family's Groups. + Returns a list containing the following information + [Family Label, Number of Rules, Top Node Label 1, Number of Children, ..., Top Node Label N, Number of Children] + """ countList=[family.label] #get top nodes forwardTemplate = family.groups.top[:] temporary = [] - symmetricTree = False for entry in forwardTemplate: if entry not in temporary: temporary.append(entry) else: - # duplicate node found at top of tree + # Duplicate node found at top of tree # eg. R_recombination: ['Y_rad', 'Y_rad'] assert len(forwardTemplate)==2 , 'Can currently only do symmetric trees with nothing else in them' - symmetricTree = True + forwardTemplate = temporary + countList.append(len(family.rules.entries)) for group in forwardTemplate: checkList=[group] childrenList=[group] @@ -183,28 +188,35 @@ def countNodes(family): checkList.extend(checkList[0].children) del checkList[0] - countList.append(len(childrenList)) + countList.extend([group.label, len(childrenList)]) + return countList ########################################################################################################### # Functions for the full Database -def countNodesAll(FullDatabase, trialDir): - for family in FullDatabase.kinetics.families.values(): - family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) - +def obtainKineticsFamilyStatistics(FullDatabase, trialDir): + """ + Obtains statistics for the kinetics families by creating + a FamilyStatistics.csv file that gives information about each family: the total number of + rules, and the top node names and the number of groups under each. + Note: does NOT average up the database or create any rate rules from training data. + If that is desired it must be done prior to entering this function. (averaging may not be desired + as it would add non-exact rules to the rule count) + """ allFamilyNames=FullDatabase.kinetics.families.keys() familyCount={} for familyName in allFamilyNames: family=FullDatabase.kinetics.families[familyName] - print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' nodes)' + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rules)' familyCount[familyName]=countNodes(family) - - with open(os.path.join(trialDir, 'NodeCount.csv'), 'wb') as csvfile: + + with open(os.path.join(trialDir, 'FamilyStatistics.csv'), 'wb') as csvfile: csvwriter=csv.writer(csvfile) + csvwriter.writerow(['Family','Number of Rules', 'Top Node 1', 'Number of Groups', 'Top Node 2', 'Number of Groups', 'Top Node 3', 'Number of Groups']) for key, value in familyCount.iteritems(): csvwriter.writerow(value) @@ -324,23 +336,36 @@ def leaveOneOut(FullDatabase, trialDir): if __name__ == '__main__': from rmgpy import settings + + # Create the data evaluation directory + trialDir = os.path.join(settings['database.directory'],'..','testing','eval') + if not os.path.exists(trialDir): + os.makedirs(trialDir) + print 'Loading the RMG database...' FullDatabase=RMGDatabase() FullDatabase.load(settings['database.directory'], - kineticsFamilies=['Disproportionation'], + kineticsFamilies=['intra_H_migration'], kineticsDepositories='all', - thermoLibraries=[], + thermoLibraries=['primaryThermoLibrary'], # Use just the primary thermo library, which contains necessary small molecular thermo reactionLibraries=[], ) - trialDir = os.path.join(settings['database.directory'],'..','testing','eval') - if not os.path.exists(trialDir): - os.makedirs(trialDir) + # Prepare the database by loading training reactions + for family in FullDatabase.kinetics.families.values(): + family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) + + print 'Obtaining statistics for the families...' + obtainKineticsFamilyStatistics(FullDatabase, trialDir) + + # Fill in the rate rules by averaging when we are ready to retrieve kinetics + for family in FullDatabase.kinetics.families.values(): + family.fillKineticsRulesByAveragingUp() + + print 'Evaluating the NIST Kinetics against the RMG estimates...' compareNIST(FullDatabase, trialDir) - print 'Counting the rate rules in the families...' - countNodesAll(FullDatabase, trialDir) print 'Performing the leave on out test on the kinetics families...' leaveOneOut(FullDatabase, trialDir) From f255c7f0f57b7a34c55899e4d8fccf0aabf254b0 Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Fri, 22 Jul 2016 17:26:09 -0400 Subject: [PATCH 07/16] Change node description to rate rule because that's what it is --- testing/evaluateKinetics.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index ed04fa9ab1..66ca763f20 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -211,7 +211,7 @@ def obtainKineticsFamilyStatistics(FullDatabase, trialDir): for familyName in allFamilyNames: family=FullDatabase.kinetics.families[familyName] - print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rules)' + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)' familyCount[familyName]=countNodes(family) with open(os.path.join(trialDir, 'FamilyStatistics.csv'), 'wb') as csvfile: @@ -239,9 +239,9 @@ def compareNIST(FullDatabase, trialDir): for familyName in allFamilyNames: family=FullDatabase.kinetics.families[familyName] - print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' nodes)' + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)' if len(family.rules.entries) < 2: - print ' Skipping', familyName, ': only has one node...' + print ' Skipping', familyName, ': only has one rate rule...' else: exactKinetics, approxKinetics = getKineticsDepository(FullDatabase, family, 'NIST') @@ -257,7 +257,7 @@ def compareNIST(FullDatabase, trialDir): parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) if len(parityData)<2: - print ' Skipping', familyName, ': only one node was calculated...' + print ' Skipping', familyName, ': only one rate rule was calculated...' continue QDict[familyName]=calculateQ(parityData) createParityPlot(parityData) @@ -302,15 +302,15 @@ def leaveOneOut(FullDatabase, trialDir): for familyName in allFamilyNames: family=FullDatabase.kinetics.families[familyName] - print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' nodes)' + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)' if len(family.rules.entries) < 2: - print ' Skipping', familyName, ': only has one node...' + print ' Skipping', familyName, ': only has one rate rule...' else: - exactKinetics, approxKinetics =getKineticsLeaveOneOut(family) + exactKinetics, approxKinetics = getKineticsLeaveOneOut(family) parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) if len(parityData)<2: - print ' Skipping', familyName, ': only one node was calculated...' + print ' Skipping', familyName, ': only one rate rule was calculated...' continue QDict[familyName]=calculateQ(parityData) createParityPlot(parityData) From f0db6ff6adcf612761f225d13a0ac6486c085e4e Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Fri, 22 Jul 2016 17:40:00 -0400 Subject: [PATCH 08/16] Fix leaveOneOut test and add some time stamping for the leaveOneTest --- testing/evaluateKinetics.py | 42 ++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index 66ca763f20..bc4783e5e7 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -14,6 +14,7 @@ import re import copy import csv +from time import time from rmgpy.thermo import * from rmgpy.kinetics import * @@ -80,12 +81,17 @@ def getKineticsLeaveOneOut(family): Performs the leave one out test on a family. It returns a dictionary of the original exact nodes and a dictionary of the new averaged nodes. The returned dictionary entries will be of a KineticModel class + It deletes a single entry in the family, and then re-averages the tree + and then tries to re-estimate that original deleted entry. + + The original family should not contained averaged nodes when starting out. The + leave one out test should be performed only for original exact matches. """ exactKinetics={} approxKinetics={} for entryKey in family.rules.entries.keys(): - template = family.retrieveTemplate(entryKey) + template = family.retrieveTemplate(entryKey.split(';')) exactKinetics[entryKey], exactKineticsEntry=family.rules.estimateKinetics(template) familyCopy=copy.deepcopy(family) @@ -285,17 +291,19 @@ def compareNIST(FullDatabase, trialDir): def leaveOneOut(FullDatabase, trialDir): """ Performs leave one out analysis on all the kinetics families. + The algorithm deletes a single entry in the family, and then re-averages the tree + and then tries to re-estimate that original deleted entry. The difference between + these values is used to create a parity plot and averaged mean squared error statistics. + + Note: training data and averaging of the database is not performed at the beginning of + this function, and must be performed outside the function. Averaging the trees should not + be performed so as to not perform the leave one out test on rate rules that were averaged. """ - trialDir=os.path.join(trialDir, 'LeaveOneOut') + trialDir=os.path.join(trialDir, 'leaveOneOut') if not os.path.exists(trialDir): os.makedirs(trialDir) - for family in FullDatabase.kinetics.families.values(): - family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) - -# familyName='intra_substitutionCS_isomerization' -# allFamilyNames=[familyName] allFamilyNames=FullDatabase.kinetics.families.keys() QDict={} @@ -306,7 +314,12 @@ def leaveOneOut(FullDatabase, trialDir): if len(family.rules.entries) < 2: print ' Skipping', familyName, ': only has one rate rule...' else: + + start_time = time() exactKinetics, approxKinetics = getKineticsLeaveOneOut(family) + end_time = time() + time_taken = end_time - start_time + print "Time spent: {0:.2f} minutes".format(time_taken/60.0) parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) if len(parityData)<2: @@ -345,28 +358,27 @@ def leaveOneOut(FullDatabase, trialDir): print 'Loading the RMG database...' FullDatabase=RMGDatabase() FullDatabase.load(settings['database.directory'], - kineticsFamilies=['intra_H_migration'], + kineticsFamilies=['Cyclic_Ether_Formation'], kineticsDepositories='all', thermoLibraries=['primaryThermoLibrary'], # Use just the primary thermo library, which contains necessary small molecular thermo reactionLibraries=[], ) - # Prepare the database by loading training reactions + # Prepare the database by loading training reactions but not averaging the rate rules for family in FullDatabase.kinetics.families.values(): family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) - + print 'Obtaining statistics for the families...' obtainKineticsFamilyStatistics(FullDatabase, trialDir) - # Fill in the rate rules by averaging when we are ready to retrieve kinetics + print 'Performing the leave on out test on the kinetics families...' + leaveOneOut(FullDatabase, trialDir) + + # Fill in the rate rules by averaging when we are ready to compare real kinetics for family in FullDatabase.kinetics.families.values(): family.fillKineticsRulesByAveragingUp() print 'Evaluating the NIST Kinetics against the RMG estimates...' compareNIST(FullDatabase, trialDir) - - - print 'Performing the leave on out test on the kinetics families...' - leaveOneOut(FullDatabase, trialDir) \ No newline at end of file From e90692c928c7407d91d84ab706e522995d4fc925 Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Fri, 22 Jul 2016 17:51:09 -0400 Subject: [PATCH 09/16] Improve print statements and change script to test all families by default --- testing/evaluateKinetics.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index bc4783e5e7..703c35a8dc 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -263,7 +263,7 @@ def compareNIST(FullDatabase, trialDir): parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) if len(parityData)<2: - print ' Skipping', familyName, ': only one rate rule was calculated...' + print ' Skipping', familyName, ': {} reactions were compared...'.format(len(parityData)) continue QDict[familyName]=calculateQ(parityData) createParityPlot(parityData) @@ -358,7 +358,7 @@ def leaveOneOut(FullDatabase, trialDir): print 'Loading the RMG database...' FullDatabase=RMGDatabase() FullDatabase.load(settings['database.directory'], - kineticsFamilies=['Cyclic_Ether_Formation'], + kineticsFamilies='all', kineticsDepositories='all', thermoLibraries=['primaryThermoLibrary'], # Use just the primary thermo library, which contains necessary small molecular thermo reactionLibraries=[], @@ -368,17 +368,22 @@ def leaveOneOut(FullDatabase, trialDir): for family in FullDatabase.kinetics.families.values(): family.addKineticsRulesFromTrainingSet(thermoDatabase=FullDatabase.thermo) + print '--------------------------------------------' print 'Obtaining statistics for the families...' obtainKineticsFamilyStatistics(FullDatabase, trialDir) + print '--------------------------------------------' print 'Performing the leave on out test on the kinetics families...' leaveOneOut(FullDatabase, trialDir) + print '--------------------------------------------' + print 'Filling up the family rate rules by averaging... Expect larger number of rate rules in subsequent tests' # Fill in the rate rules by averaging when we are ready to compare real kinetics for family in FullDatabase.kinetics.families.values(): family.fillKineticsRulesByAveragingUp() + print '--------------------------------------------' print 'Evaluating the NIST Kinetics against the RMG estimates...' compareNIST(FullDatabase, trialDir) \ No newline at end of file From 797bece9874c1b290f103d1355bb646a6095925f Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Sat, 23 Jul 2016 00:26:38 -0400 Subject: [PATCH 10/16] Fix NIST reactions Add NIST folder to ketoenol. Fix atom labeling for several reactions --- .../NIST/dictionary.txt | 4 +- .../R_Addition_MultipleBond/NIST/reactions.py | 2 +- .../intra_H_migration/NIST/dictionary.txt | 64 +++++++++---------- .../families/ketoenol/NIST/dictionary.txt | 0 .../families/ketoenol/NIST/reactions.py | 8 +++ 5 files changed, 43 insertions(+), 35 deletions(-) create mode 100644 input/kinetics/families/ketoenol/NIST/dictionary.txt create mode 100644 input/kinetics/families/ketoenol/NIST/reactions.py diff --git a/input/kinetics/families/Intra_R_Add_Endocyclic/NIST/dictionary.txt b/input/kinetics/families/Intra_R_Add_Endocyclic/NIST/dictionary.txt index a8df8e267e..93c415b7a8 100644 --- a/input/kinetics/families/Intra_R_Add_Endocyclic/NIST/dictionary.txt +++ b/input/kinetics/families/Intra_R_Add_Endocyclic/NIST/dictionary.txt @@ -1,7 +1,7 @@ C6H11-2 multiplicity 2 1 *4 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} -2 C u0 p0 c0 {1,S} {4,S} {9,S} {10,S} +2 *6 C u0 p0 c0 {1,S} {4,S} {9,S} {10,S} 3 *1 C u0 p0 c0 {1,S} {5,S} {11,S} {12,S} 4 *5 C u0 p0 c0 {2,S} {6,S} {13,S} {14,S} 5 *3 C u0 p0 c0 {3,S} {6,S} {15,S} {16,S} @@ -140,7 +140,7 @@ multiplicity 2 C6H11 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} 2 *5 C u0 p0 c0 {1,S} {4,S} {9,S} {10,S} 3 *4 C u0 p0 c0 {1,S} {5,S} {11,S} {12,S} 4 *2 C u0 p0 c0 {2,S} {6,D} {13,S} diff --git a/input/kinetics/families/R_Addition_MultipleBond/NIST/reactions.py b/input/kinetics/families/R_Addition_MultipleBond/NIST/reactions.py index acf0a17e5e..4ed1137243 100644 --- a/input/kinetics/families/R_Addition_MultipleBond/NIST/reactions.py +++ b/input/kinetics/families/R_Addition_MultipleBond/NIST/reactions.py @@ -4333,7 +4333,7 @@ Ea = (100.6, 'kJ/mol'), T0 = (1, 'K'), Tmin = (400, 'K'), - Tmax = (400, 'K'), + Tmax = (2000, 'K'), ), reference = Article( authors = ["Hoyermann, K.", "Olzmann, M.", "Seeba, J.", "Viskolcz, B."], diff --git a/input/kinetics/families/intra_H_migration/NIST/dictionary.txt b/input/kinetics/families/intra_H_migration/NIST/dictionary.txt index 5f6f60dff6..bdcf138aa8 100644 --- a/input/kinetics/families/intra_H_migration/NIST/dictionary.txt +++ b/input/kinetics/families/intra_H_migration/NIST/dictionary.txt @@ -25,7 +25,7 @@ multiplicity 2 C5H11O2-2 multiplicity 2 1 *4 C u0 p0 c0 {2,S} {3,S} {4,S} {5,S} -2 C u0 p0 c0 {1,S} {6,S} {8,S} {9,S} +2 *6 C u0 p0 c0 {1,S} {6,S} {8,S} {9,S} 3 C u0 p0 c0 {1,S} {10,S} {11,S} {12,S} 4 C u0 p0 c0 {1,S} {13,S} {14,S} {15,S} 5 *1 C u1 p0 c0 {1,S} {16,S} {17,S} @@ -46,7 +46,7 @@ multiplicity 2 C4H9O multiplicity 2 1 *5 C u0 p0 c0 {2,S} {3,S} {5,S} {6,S} -2 C u0 p0 c0 {1,S} {4,S} {7,S} {8,S} +2 *6 C u0 p0 c0 {1,S} {4,S} {7,S} {8,S} 3 *2 C u0 p0 c0 {1,S} {10,S} {11,S} {12,S} 4 *4 C u0 p0 c0 {2,S} {9,S} {13,S} {14,S} 5 H u0 p0 c0 {1,S} @@ -64,7 +64,7 @@ C7H15O multiplicity 2 1 *2 C u0 p0 c0 {2,S} {5,S} {6,S} {8,S} 2 *5 C u0 p0 c0 {1,S} {3,S} {10,S} {11,S} -3 C u0 p0 c0 {2,S} {4,S} {12,S} {13,S} +3 *6 C u0 p0 c0 {2,S} {4,S} {12,S} {13,S} 4 *4 C u0 p0 c0 {3,S} {7,S} {9,S} {14,S} 5 C u0 p0 c0 {1,S} {15,S} {16,S} {17,S} 6 C u0 p0 c0 {1,S} {18,S} {19,S} {20,S} @@ -205,7 +205,7 @@ multiplicity 2 C4H9O-2 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {6,S} {7,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {6,S} {7,S} 2 *4 C u0 p0 c0 {1,S} {4,S} {8,S} {9,S} 3 *5 C u0 p0 c0 {1,S} {5,S} {10,S} {11,S} 4 *1 C u1 p0 c0 {2,S} {12,S} {13,S} @@ -223,7 +223,7 @@ multiplicity 2 C6H13O-10 multiplicity 2 1 *5 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} -2 C u0 p0 c0 {1,S} {4,S} {9,S} {10,S} +2 *6 C u0 p0 c0 {1,S} {4,S} {9,S} {10,S} 3 C u0 p0 c0 {1,S} {5,S} {11,S} {12,S} 4 *4 C u0 p0 c0 {2,S} {6,S} {13,S} {14,S} 5 C u0 p0 c0 {3,S} {15,S} {16,S} {17,S} @@ -276,7 +276,7 @@ multiplicity 2 C6H11O2-4 multiplicity 2 -1 C u0 p0 c0 {2,S} {4,S} {7,S} {9,S} +1 *6 C u0 p0 c0 {2,S} {4,S} {7,S} {9,S} 2 C u0 p0 c0 {1,S} {3,S} {10,S} {11,S} 3 C u0 p0 c0 {2,S} {5,S} {12,S} {13,S} 4 *4 C u0 p0 c0 {1,S} {6,S} {14,S} {15,S} @@ -320,8 +320,8 @@ multiplicity 2 C6H11O2-2 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {7,S} {9,S} -2 C u0 p0 c0 {1,S} {4,S} {10,S} {11,S} +1 *7 C u0 p0 c0 {2,S} {3,S} {7,S} {9,S} +2 *6 C u0 p0 c0 {1,S} {4,S} {10,S} {11,S} 3 C u0 p0 c0 {1,S} {5,S} {12,S} {13,S} 4 *4 C u0 p0 c0 {2,S} {6,S} {14,S} {15,S} 5 C u0 p0 c0 {3,S} {6,S} {16,S} {17,S} @@ -342,7 +342,7 @@ multiplicity 2 C6H11O2-3 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} 2 *5 C u0 p0 c0 {1,S} {5,S} {10,S} {11,S} 3 C u0 p0 c0 {1,S} {6,S} {12,S} {13,S} 4 C u0 p0 c0 {5,S} {6,S} {14,S} {15,S} @@ -387,7 +387,7 @@ multiplicity 2 C5H11O-6 multiplicity 2 1 *5 C u0 p0 c0 {2,S} {4,S} {6,S} {7,S} -2 C u0 p0 c0 {1,S} {3,S} {8,S} {9,S} +2 *6 C u0 p0 c0 {1,S} {3,S} {8,S} {9,S} 3 *4 C u0 p0 c0 {2,S} {5,S} {10,S} {11,S} 4 C u0 p0 c0 {1,S} {12,S} {13,S} {14,S} 5 *1 C u1 p0 c0 {3,S} {15,S} {16,S} @@ -406,7 +406,7 @@ multiplicity 2 C5H11O-4 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {4,S} {7,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {4,S} {7,S} 2 *4 C u0 p0 c0 {1,S} {5,S} {8,S} {9,S} 3 *5 C u0 p0 c0 {1,S} {6,S} {10,S} {11,S} 4 C u0 p0 c0 {1,S} {12,S} {13,S} {14,S} @@ -426,7 +426,7 @@ multiplicity 2 C5H11O-5 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} 2 *5 C u0 p0 c0 {1,S} {4,S} {9,S} {10,S} 3 *4 C u0 p0 c0 {1,S} {5,S} {6,S} {11,S} 4 *2 C u0 p0 c0 {2,S} {12,S} {13,S} {14,S} @@ -446,7 +446,7 @@ multiplicity 2 C5H11O-2 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} 2 *4 C u0 p0 c0 {1,S} {5,S} {9,S} {10,S} 3 *5 C u0 p0 c0 {1,S} {6,S} {11,S} {12,S} 4 C u0 p0 c0 {5,S} {13,S} {14,S} {15,S} @@ -519,7 +519,7 @@ multiplicity 2 C6H13O-8 multiplicity 2 1 *5 C u0 p0 c0 {2,S} {4,S} {5,S} {7,S} -2 C u0 p0 c0 {1,S} {3,S} {8,S} {9,S} +2 *6 C u0 p0 c0 {1,S} {3,S} {8,S} {9,S} 3 *4 C u0 p0 c0 {2,S} {6,S} {10,S} {11,S} 4 C u0 p0 c0 {1,S} {12,S} {13,S} {14,S} 5 C u0 p0 c0 {1,S} {15,S} {16,S} {17,S} @@ -607,7 +607,7 @@ multiplicity 2 1 *2 C u0 p0 c0 {3,S} {5,S} {6,S} {10,S} 2 *4 C u0 p0 c0 {4,S} {7,S} {8,S} {9,S} 3 *5 C u0 p0 c0 {1,S} {4,S} {11,S} {12,S} -4 C u0 p0 c0 {2,S} {3,S} {13,S} {14,S} +4 *6 C u0 p0 c0 {2,S} {3,S} {13,S} {14,S} 5 C u0 p0 c0 {1,S} {15,S} {16,S} {17,S} 6 C u0 p0 c0 {1,S} {18,S} {19,S} {20,S} 7 C u0 p0 c0 {2,S} {21,S} {22,S} {23,S} @@ -665,8 +665,8 @@ multiplicity 2 C6H11O2 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} -2 C u0 p0 c0 {1,S} {5,S} {10,S} {11,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} +2 *7 C u0 p0 c0 {1,S} {5,S} {10,S} {11,S} 3 C u0 p0 c0 {1,S} {6,S} {12,S} {13,S} 4 *2 C u0 p0 c0 {5,S} {6,S} {9,S} {14,S} 5 *5 C u0 p0 c0 {2,S} {4,S} {15,S} {16,S} @@ -783,7 +783,7 @@ multiplicity 2 C7H15O-4 multiplicity 2 1 *5 C u0 p0 c0 {2,S} {4,S} {5,S} {8,S} -2 C u0 p0 c0 {1,S} {3,S} {9,S} {10,S} +2 *6 C u0 p0 c0 {1,S} {3,S} {9,S} {10,S} 3 *4 C u0 p0 c0 {2,S} {7,S} {11,S} {12,S} 4 C u0 p0 c0 {1,S} {13,S} {14,S} {15,S} 5 C u0 p0 c0 {1,S} {16,S} {17,S} {18,S} @@ -828,7 +828,7 @@ multiplicity 2 C6H11-2 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} +1 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} 2 *5 C u0 p0 c0 {1,S} {4,S} {9,S} {10,S} 3 *4 C u0 p0 c0 {1,S} {6,S} {11,S} {12,S} 4 *2 C u0 p0 c0 {2,S} {13,S} {14,S} {15,S} @@ -851,7 +851,7 @@ multiplicity 2 1 *2 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} 2 *5 C u0 p0 c0 {1,S} {4,S} {9,S} {10,S} 3 C u0 p0 c0 {1,S} {5,S} {11,S} {12,S} -4 C u0 p0 c0 {2,S} {6,S} {13,S} {14,S} +4 *6 C u0 p0 c0 {2,S} {6,S} {13,S} {14,S} 5 C u0 p0 c0 {3,S} {16,S} {17,S} {18,S} 6 *4 C u0 p0 c0 {4,S} {15,S} {19,S} {20,S} 7 *3 H u0 p0 c0 {1,S} @@ -872,7 +872,7 @@ multiplicity 2 C8H17O-2 multiplicity 2 1 *5 C u0 p0 c0 {2,S} {4,S} {5,S} {9,S} -2 C u0 p0 c0 {1,S} {3,S} {10,S} {11,S} +2 *6 C u0 p0 c0 {1,S} {3,S} {10,S} {11,S} 3 *4 C u0 p0 c0 {2,S} {8,S} {12,S} {13,S} 4 C u0 p0 c0 {1,S} {14,S} {15,S} {16,S} 5 C u0 p0 c0 {1,S} {17,S} {18,S} {19,S} @@ -929,7 +929,7 @@ multiplicity 2 C6H13O-7 multiplicity 2 1 *4 C u0 p0 c0 {2,S} {4,S} {5,S} {7,S} -2 C u0 p0 c0 {1,S} {3,S} {8,S} {9,S} +2 *6 C u0 p0 c0 {1,S} {3,S} {8,S} {9,S} 3 *5 C u0 p0 c0 {2,S} {6,S} {10,S} {11,S} 4 C u0 p0 c0 {1,S} {13,S} {14,S} {15,S} 5 C u0 p0 c0 {1,S} {16,S} {17,S} {18,S} @@ -952,7 +952,7 @@ multiplicity 2 C5H11O2 multiplicity 2 1 *5 C u0 p0 c0 {2,S} {3,S} {4,S} {5,S} -2 C u0 p0 c0 {1,S} {6,S} {7,S} {8,S} +2 *6 C u0 p0 c0 {1,S} {6,S} {7,S} {8,S} 3 *2 C u0 p0 c0 {1,S} {9,S} {10,S} {11,S} 4 C u0 p0 c0 {1,S} {12,S} {13,S} {14,S} 5 C u0 p0 c0 {1,S} {15,S} {16,S} {17,S} @@ -990,7 +990,7 @@ multiplicity 2 C6H13O-6 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {8,S} {9,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {8,S} {9,S} 2 *4 C u0 p0 c0 {1,S} {6,S} {10,S} {11,S} 3 *5 C u0 p0 c0 {1,S} {7,S} {12,S} {13,S} 4 C u0 p0 c0 {6,S} {14,S} {15,S} {16,S} @@ -1013,7 +1013,7 @@ multiplicity 2 C6H13O-9 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {8,S} {9,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {8,S} {9,S} 2 *4 C u0 p0 c0 {1,S} {4,S} {7,S} {10,S} 3 *5 C u0 p0 c0 {1,S} {5,S} {11,S} {12,S} 4 C u0 p0 c0 {2,S} {6,S} {13,S} {14,S} @@ -1058,7 +1058,7 @@ C6H13O-5 multiplicity 2 1 *2 C u0 p0 c0 {2,S} {4,S} {5,S} {7,S} 2 *5 C u0 p0 c0 {1,S} {3,S} {8,S} {9,S} -3 C u0 p0 c0 {2,S} {6,S} {10,S} {11,S} +3 *6 C u0 p0 c0 {2,S} {6,S} {10,S} {11,S} 4 C u0 p0 c0 {1,S} {13,S} {14,S} {15,S} 5 C u0 p0 c0 {1,S} {16,S} {17,S} {18,S} 6 *4 C u0 p0 c0 {3,S} {12,S} {19,S} {20,S} @@ -1098,7 +1098,7 @@ multiplicity 2 C6H13O-3 multiplicity 2 1 *5 C u0 p0 c0 {2,S} {3,S} {9,S} {10,S} -2 C u0 p0 c0 {1,S} {4,S} {11,S} {12,S} +2 *6 C u0 p0 c0 {1,S} {4,S} {11,S} {12,S} 3 *2 C u0 p0 c0 {1,S} {5,S} {8,S} {13,S} 4 *4 C u0 p0 c0 {2,S} {6,S} {7,S} {14,S} 5 C u0 p0 c0 {3,S} {15,S} {16,S} {17,S} @@ -1120,7 +1120,7 @@ multiplicity 2 C6H13O-2 multiplicity 2 -1 C u0 p0 c0 {2,S} {4,S} {8,S} {9,S} +1 *6 C u0 p0 c0 {2,S} {4,S} {8,S} {9,S} 2 *4 C u0 p0 c0 {1,S} {6,S} {10,S} {11,S} 3 C u0 p0 c0 {5,S} {6,S} {12,S} {13,S} 4 *5 C u0 p0 c0 {1,S} {7,S} {14,S} {15,S} @@ -1144,7 +1144,7 @@ multiplicity 2 C7H15O-2 multiplicity 2 1 *5 C u0 p0 c0 {2,S} {4,S} {8,S} {9,S} -2 C u0 p0 c0 {1,S} {3,S} {10,S} {11,S} +2 *6 C u0 p0 c0 {1,S} {3,S} {10,S} {11,S} 3 *4 C u0 p0 c0 {2,S} {7,S} {12,S} {13,S} 4 C u0 p0 c0 {1,S} {14,S} {15,S} {16,S} 5 C u0 p0 c0 {7,S} {17,S} {18,S} {19,S} @@ -1170,7 +1170,7 @@ multiplicity 2 C7H15O-3 multiplicity 2 1 *4 C u0 p0 c0 {2,S} {5,S} {6,S} {8,S} -2 C u0 p0 c0 {1,S} {3,S} {10,S} {11,S} +2 *6 C u0 p0 c0 {1,S} {3,S} {10,S} {11,S} 3 *5 C u0 p0 c0 {2,S} {4,S} {12,S} {13,S} 4 *2 C u0 p0 c0 {3,S} {7,S} {9,S} {14,S} 5 C u0 p0 c0 {1,S} {15,S} {16,S} {17,S} @@ -1235,7 +1235,7 @@ multiplicity 2 C6H11 multiplicity 2 -1 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} +1 *6 C u0 p0 c0 {2,S} {3,S} {7,S} {8,S} 2 *5 C u0 p0 c0 {1,S} {4,S} {9,S} {10,S} 3 *4 C u0 p0 c0 {1,S} {5,S} {11,S} {12,S} 4 *2 C u0 p0 c0 {2,S} {6,D} {13,S} @@ -1256,7 +1256,7 @@ multiplicity 2 C6H13O-4 multiplicity 2 1 *5 C u0 p0 c0 {2,S} {4,S} {7,S} {8,S} -2 C u0 p0 c0 {1,S} {3,S} {9,S} {10,S} +2 *6 C u0 p0 c0 {1,S} {3,S} {9,S} {10,S} 3 *4 C u0 p0 c0 {2,S} {6,S} {11,S} {12,S} 4 C u0 p0 c0 {1,S} {13,S} {14,S} {15,S} 5 C u0 p0 c0 {6,S} {16,S} {17,S} {18,S} diff --git a/input/kinetics/families/ketoenol/NIST/dictionary.txt b/input/kinetics/families/ketoenol/NIST/dictionary.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/input/kinetics/families/ketoenol/NIST/reactions.py b/input/kinetics/families/ketoenol/NIST/reactions.py new file mode 100644 index 0000000000..1857edb8bc --- /dev/null +++ b/input/kinetics/families/ketoenol/NIST/reactions.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python +# encoding: utf-8 + +name = "ketoenol/NIST" +shortDesc = u"" +longDesc = u""" + +""" From 53d79077ad3a002a3ba7fe91fe1e59b9ccf85e12 Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Tue, 26 Jul 2016 15:12:51 -0400 Subject: [PATCH 11/16] More comment fixing in script --- testing/evaluateKinetics.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index 703c35a8dc..0a074d3af2 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -14,7 +14,6 @@ import re import copy import csv -from time import time from rmgpy.thermo import * from rmgpy.kinetics import * @@ -212,7 +211,8 @@ def obtainKineticsFamilyStatistics(FullDatabase, trialDir): as it would add non-exact rules to the rule count) """ allFamilyNames=FullDatabase.kinetics.families.keys() - + allFamilyNames.sort() # Perform test on families alphabetically + familyCount={} for familyName in allFamilyNames: @@ -240,6 +240,7 @@ def compareNIST(FullDatabase, trialDir): allFamilyNames=FullDatabase.kinetics.families.keys() + allFamilyNames.sort() # Perform test on families alphabetically QDict={} @@ -305,7 +306,7 @@ def leaveOneOut(FullDatabase, trialDir): os.makedirs(trialDir) allFamilyNames=FullDatabase.kinetics.families.keys() - + allFamilyNames.sort() # Perform test on families alphabetically QDict={} for familyName in allFamilyNames: @@ -314,16 +315,11 @@ def leaveOneOut(FullDatabase, trialDir): if len(family.rules.entries) < 2: print ' Skipping', familyName, ': only has one rate rule...' else: - - start_time = time() exactKinetics, approxKinetics = getKineticsLeaveOneOut(family) - end_time = time() - time_taken = end_time - start_time - print "Time spent: {0:.2f} minutes".format(time_taken/60.0) parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) if len(parityData)<2: - print ' Skipping', familyName, ': only one rate rule was calculated...' + print ' Skipping', familyName, ': {} rate rules were compared...'.format(len(parityData)) continue QDict[familyName]=calculateQ(parityData) createParityPlot(parityData) From cb2f649b2c67310bb1fb2ed0f64fcc00f1d702a2 Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Tue, 26 Jul 2016 15:14:42 -0400 Subject: [PATCH 12/16] Add argument for performing leaveOneOutTest without averaging Is not quite the original leaveOneOut test as it was done before. The original test takes extremely long to run because it must average up the family after each exact rate rule. In this method, we remove the original rate rule, and see if using an average that (should incorporate the original) does a good job of estimating that rate rule. The option to use the original version of the algorithm can still be used by setting averaging=True Write data to csv file after each family rather than after entire set of families so that they are in alphabetical order --- testing/evaluateKinetics.py | 209 +++++++++++++++++++++--------------- 1 file changed, 124 insertions(+), 85 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index 0a074d3af2..3a2dc28ace 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -75,7 +75,7 @@ def getKineticsDepository(FullDatabase, family, depositoryLabel): return exactKinetics, approxKinetics -def getKineticsLeaveOneOut(family): +def getKineticsLeaveOneOut(family, averaging=True): """ Performs the leave one out test on a family. It returns a dictionary of the original exact nodes and a dictionary of the new averaged nodes. @@ -89,15 +89,52 @@ def getKineticsLeaveOneOut(family): exactKinetics={} approxKinetics={} + rootTemplate = family.getRootTemplate() + for entryKey in family.rules.entries.keys(): template = family.retrieveTemplate(entryKey.split(';')) - exactKinetics[entryKey], exactKineticsEntry=family.rules.estimateKinetics(template) + exactKineticsEntry=family.rules.getRule(template) # fetch the best matched rule + if exactKineticsEntry.rank == 0: + # Skip rank zero entries, because they are replaced by averaged values + # during model generation + continue + + exactKineticsData = exactKineticsEntry.data + if exactKineticsData.comment: + if 'training' not in exactKineticsData.comment: + # Averaged nodes have kinetics data comments, so skip them unless + # They were created from training reactions + continue + + exactKinetics[entryKey] = exactKineticsData + + if averaging: + # In this scheme, we remove the data fully and try to pretend the database + # wants this kinetic value when we know nothing about it + familyCopy=copy.deepcopy(family) + familyCopy.rules.entries.pop(entryKey) + familyCopy.fillKineticsRulesByAveragingUp() + approxKinetics[entryKey], approxKineticsEntry=familyCopy.rules.estimateKinetics(template) + else: + # In this scheme, do not re-average the tree, just try to see if the nearest + # best node provides a good estimate for the original kinetics. + # This takes significanty less time to run, but is not a true validation of the data, + # it is just testing our kinetics selection algorithm + + # Skip the top node in this scheme, because nothing can predict it if removed + + if template == rootTemplate: + continue + + originalEntry = family.rules.entries[entryKey] + family.rules.entries.pop(entryKey) + approxKinetics[entryKey], approxKineticsEntry=family.rules.estimateKinetics(template) + # Re-add the data back into the original family + family.rules.entries[entryKey] = originalEntry + + - familyCopy=copy.deepcopy(family) - familyCopy.rules.entries.pop(entryKey) - familyCopy.fillKineticsRulesByAveragingUp() - approxKinetics[entryKey], approxKineticsEntry=familyCopy.rules.estimateKinetics(template) return exactKinetics, approxKinetics @@ -211,20 +248,17 @@ def obtainKineticsFamilyStatistics(FullDatabase, trialDir): as it would add non-exact rules to the rule count) """ allFamilyNames=FullDatabase.kinetics.families.keys() - allFamilyNames.sort() # Perform test on families alphabetically - - familyCount={} + allFamilyNames.sort(key=str.lower) # Perform test on families alphabetically - for familyName in allFamilyNames: - family=FullDatabase.kinetics.families[familyName] - print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)' - familyCount[familyName]=countNodes(family) - with open(os.path.join(trialDir, 'FamilyStatistics.csv'), 'wb') as csvfile: csvwriter=csv.writer(csvfile) csvwriter.writerow(['Family','Number of Rules', 'Top Node 1', 'Number of Groups', 'Top Node 2', 'Number of Groups', 'Top Node 3', 'Number of Groups']) - for key, value in familyCount.iteritems(): - csvwriter.writerow(value) + + for familyName in allFamilyNames: + family=FullDatabase.kinetics.families[familyName] + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)' + # Save to csv file + csvwriter.writerow(countNodes(family)) def compareNIST(FullDatabase, trialDir): @@ -240,56 +274,56 @@ def compareNIST(FullDatabase, trialDir): allFamilyNames=FullDatabase.kinetics.families.keys() - allFamilyNames.sort() # Perform test on families alphabetically + allFamilyNames.sort(key=str.lower) # Perform test on families alphabetically QDict={} - for familyName in allFamilyNames: - family=FullDatabase.kinetics.families[familyName] - print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)' - if len(family.rules.entries) < 2: - print ' Skipping', familyName, ': only has one rate rule...' - else: - exactKinetics, approxKinetics = getKineticsDepository(FullDatabase, family, 'NIST') - - #prune for exact matches only - keysToRemove=[] - for key, kinetics in approxKinetics.iteritems(): - if not re.search('Exact', kinetics.comment): - keysToRemove.append(key) - - for key in keysToRemove: - del approxKinetics[key] - - parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) - - if len(parityData)<2: - print ' Skipping', familyName, ': {} reactions were compared...'.format(len(parityData)) - continue - QDict[familyName]=calculateQ(parityData) - createParityPlot(parityData) - plt.title(familyName) - plt.savefig(os.path.join(trialDir, familyName +'.png')) - plt.clf() - - if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))): - os.makedirs(os.path.join(trialDir, 'ParityData')) - - with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as csvfile: - csvwriter=csv.writer(csvfile) - for key, value in parityData.iteritems(): - csvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment]) - with open(os.path.join(trialDir, 'QDict_LOO.csv'), 'wb') as csvfile: csvwriter=csv.writer(csvfile) - for key, value in QDict.iteritems(): - csvwriter.writerow([key, value]) + for familyName in allFamilyNames: + family=FullDatabase.kinetics.families[familyName] + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)' + if len(family.rules.entries) < 2: + print ' Skipping', familyName, ': only has one rate rule...' + else: + exactKinetics, approxKinetics = getKineticsDepository(FullDatabase, family, 'NIST') + + #prune for exact matches only + keysToRemove=[] + for key, kinetics in approxKinetics.iteritems(): + if not re.search('Exact', kinetics.comment): + keysToRemove.append(key) + + for key in keysToRemove: + del approxKinetics[key] + + parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) + + if len(parityData)<2: + print ' Skipping', familyName, ': {} reactions were compared...'.format(len(parityData)) + continue + QDict[familyName]=calculateQ(parityData) + createParityPlot(parityData) + plt.title(familyName) + plt.savefig(os.path.join(trialDir, familyName +'.png')) + plt.clf() + + if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))): + os.makedirs(os.path.join(trialDir, 'ParityData')) + + with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as paritycsvfile: + paritycsvwriter=csv.writer(paritycsvfile) + for key, value in parityData.iteritems(): + paritycsvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment]) + + # Save data to csv file + csvwriter.writerow([familyName, QDict[familyName]]) -def leaveOneOut(FullDatabase, trialDir): +def leaveOneOut(FullDatabase, trialDir, averaging=True): """ Performs leave one out analysis on all the kinetics families. The algorithm deletes a single entry in the family, and then re-averages the tree @@ -306,39 +340,44 @@ def leaveOneOut(FullDatabase, trialDir): os.makedirs(trialDir) allFamilyNames=FullDatabase.kinetics.families.keys() - allFamilyNames.sort() # Perform test on families alphabetically + allFamilyNames.sort(key=str.lower) # Perform test on families alphabetically QDict={} - for familyName in allFamilyNames: - family=FullDatabase.kinetics.families[familyName] - print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)' - if len(family.rules.entries) < 2: - print ' Skipping', familyName, ': only has one rate rule...' - else: - exactKinetics, approxKinetics = getKineticsLeaveOneOut(family) - parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) - - if len(parityData)<2: - print ' Skipping', familyName, ': {} rate rules were compared...'.format(len(parityData)) - continue - QDict[familyName]=calculateQ(parityData) - createParityPlot(parityData) - plt.title(familyName) - plt.savefig(os.path.join(trialDir, familyName +'.png')) - plt.clf() - - if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))): - os.makedirs(os.path.join(trialDir, 'ParityData')) - - with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as csvfile: - csvwriter=csv.writer(csvfile) - for key, value in parityData.iteritems(): - csvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment]) with open(os.path.join(trialDir, 'QDict_LOO.csv'), 'wb') as csvfile: csvwriter=csv.writer(csvfile) - for key, value in QDict.iteritems(): - csvwriter.writerow([key, value]) + + for familyName in allFamilyNames: + family=FullDatabase.kinetics.families[familyName] + print "Processing", familyName + '...', '(' + str(len(family.rules.entries)) + ' rate rules)' + if len(family.rules.entries) < 2: + print ' Skipping', familyName, ': only has one rate rule...' + else: + if not averaging: + # Pre-average the family if averaging is not turned on + family.fillKineticsRulesByAveragingUp() + exactKinetics, approxKinetics = getKineticsLeaveOneOut(family, averaging) + parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) + + if len(parityData)<2: + print ' Skipping', familyName, ': {} rate rules were compared...'.format(len(parityData)) + continue + QDict[familyName]=calculateQ(parityData) + createParityPlot(parityData) + plt.title(familyName) + plt.savefig(os.path.join(trialDir, familyName +'.png')) + plt.clf() + + if not os.path.exists(os.path.join(os.path.join(trialDir, 'ParityData'))): + os.makedirs(os.path.join(trialDir, 'ParityData')) + + with open(os.path.join(trialDir, 'ParityData', familyName + '.csv'), 'wb') as paritycsvfile: + paritycsvwriter=csv.writer(paritycsvfile) + for key, value in parityData.iteritems(): + paritycsvwriter.writerow([key, value[0]/value[1], approxKinetics[key].comment]) + + # Save to csv file + csvwriter.writerow([familyName, QDict[familyName]]) return @@ -370,7 +409,7 @@ def leaveOneOut(FullDatabase, trialDir): print '--------------------------------------------' print 'Performing the leave on out test on the kinetics families...' - leaveOneOut(FullDatabase, trialDir) + leaveOneOut(FullDatabase, trialDir, averaging=False) print '--------------------------------------------' print 'Filling up the family rate rules by averaging... Expect larger number of rate rules in subsequent tests' From 20c2ee9954333f4b2032125059696e407cc43717 Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Wed, 27 Jul 2016 12:12:33 -0400 Subject: [PATCH 13/16] Remove the cutoff argument for in analyzeForParity This used to excise parity data, it's better to keep them for complete statistical analysis --- testing/evaluateKinetics.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index 3a2dc28ace..fe32674621 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -150,7 +150,7 @@ def calculateParity(exactKineticModel, approxKineticModel, T): return float(approx)/float(exact) -def analyzeForParity(exactKinetics, approxKinetics, T=1000.0, cutoff=0.0): +def analyzeForParity(exactKinetics, approxKinetics, T=1000.0): """ Creates a parity plot from the exactKinetics and approxKinetics (dictionarys with kineticModels are entries). Uses the median temperature of the exactKinetics to @@ -163,8 +163,6 @@ def analyzeForParity(exactKinetics, approxKinetics, T=1000.0, cutoff=0.0): exact=exactKinetics[key].getRateCoefficient(T) approx=approxKinetics[key].getRateCoefficient(T) dataPoint=[exact, approx] - if cutoff!=0 and math.log10((float(exact)/float(approx)))**2 > cutoff**2: - continue parityData[key]=dataPoint return parityData @@ -297,7 +295,7 @@ def compareNIST(FullDatabase, trialDir): for key in keysToRemove: del approxKinetics[key] - parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) + parityData=analyzeForParity(exactKinetics, approxKinetics) if len(parityData)<2: print ' Skipping', familyName, ': {} reactions were compared...'.format(len(parityData)) @@ -357,7 +355,7 @@ def leaveOneOut(FullDatabase, trialDir, averaging=True): # Pre-average the family if averaging is not turned on family.fillKineticsRulesByAveragingUp() exactKinetics, approxKinetics = getKineticsLeaveOneOut(family, averaging) - parityData=analyzeForParity(exactKinetics, approxKinetics, cutoff=8.0) + parityData=analyzeForParity(exactKinetics, approxKinetics) if len(parityData)<2: print ' Skipping', familyName, ': {} rate rules were compared...'.format(len(parityData)) From 9edd39158be54b0f5f011b7da60accc115b75397 Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Wed, 27 Jul 2016 12:13:58 -0400 Subject: [PATCH 14/16] Add an argument `pruneForExact` in compareNIST to return only exact RMG kinetics in parityData User gets to choose whether it wants all kinetics in the parity Data (both exact and non-exact), or only the exact ones --- testing/evaluateKinetics.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index fe32674621..7a4d2e628e 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -259,12 +259,15 @@ def obtainKineticsFamilyStatistics(FullDatabase, trialDir): csvwriter.writerow(countNodes(family)) -def compareNIST(FullDatabase, trialDir): +def compareNIST(FullDatabase, trialDir, pruneForExact=False): """ Compare NIST reaction kinetics with estimates from RMG. Creates parity plot and calculates the predicted root mean square error from the families. Note: does NOT average up the database or create any rate rules from training data. If that is desired it must be done prior to entering this function. + + if pruneForExact=True, only the comparisons again exact match kinetics from RMG are returned + otherwise, estimated and exact matches are all included in the parity data. """ trialDir=os.path.join(trialDir, 'compareNIST') if not os.path.exists(trialDir): @@ -286,14 +289,16 @@ def compareNIST(FullDatabase, trialDir): else: exactKinetics, approxKinetics = getKineticsDepository(FullDatabase, family, 'NIST') - #prune for exact matches only - keysToRemove=[] - for key, kinetics in approxKinetics.iteritems(): - if not re.search('Exact', kinetics.comment): - keysToRemove.append(key) - for key in keysToRemove: - del approxKinetics[key] + if pruneForExact: + # prune for exact matches only + keysToRemove=[] + for key, kinetics in approxKinetics.iteritems(): + if not re.search('Exact', kinetics.comment): + keysToRemove.append(key) + + for key in keysToRemove: + del approxKinetics[key] parityData=analyzeForParity(exactKinetics, approxKinetics) From 8f0c92201b2db652e926eb0f29dfbd4a672b06bc Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Wed, 27 Jul 2016 12:14:32 -0400 Subject: [PATCH 15/16] Safer re.escape for finding depository using its label --- testing/evaluateKinetics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index 7a4d2e628e..febce5a5f5 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -36,7 +36,7 @@ def getKineticsDepository(FullDatabase, family, depositoryLabel): depository = None for tempDepository in family.depositories: - if re.search(depositoryLabel, tempDepository.label): + if re.search(re.escape(depositoryLabel), tempDepository.label): depository=tempDepository break else: From 78e42c5756b2bd625e342400a2903782ccb1b308 Mon Sep 17 00:00:00 2001 From: Connie Gao Date: Wed, 27 Jul 2016 12:15:10 -0400 Subject: [PATCH 16/16] Some fixed docstrings and lengthier explanations --- testing/evaluateKinetics.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/testing/evaluateKinetics.py b/testing/evaluateKinetics.py index febce5a5f5..dada1abb1e 100644 --- a/testing/evaluateKinetics.py +++ b/testing/evaluateKinetics.py @@ -29,7 +29,8 @@ def getKineticsDepository(FullDatabase, family, depositoryLabel): """ - Retrieve the NIST exact and approximated kinetics for those same reactions from RMG. + Retrieve dictionaries of exact kinetics from NIST depository and approximated kinetics + for those same reactions from RMG. Note: does NOT average up the database or create any rate rules from training data. If that is desired it must be done prior to entering this function. """ @@ -85,6 +86,19 @@ def getKineticsLeaveOneOut(family, averaging=True): The original family should not contained averaged nodes when starting out. The leave one out test should be performed only for original exact matches. + + There are two methods of performing the leave one out test: + + averaging=True, each exact entry is removed, and the family's trees + are re-averaged WITHOUT that entry and used to estimate the original entry. + This method tests how the database performs to estimate data when it is + missing. Speed of this test is slow. + + averaging=False, the family is pre-averaged, and then the exact entry + is removed, and RMG is used to re-estimate that entry. This method + tests how the database averaging scheme performs, since we expect the + estimate to use a more general node that still incorporates the data + from the original exact entry. Speed of this test is fast. """ exactKinetics={} approxKinetics={} @@ -153,8 +167,8 @@ def calculateParity(exactKineticModel, approxKineticModel, T): def analyzeForParity(exactKinetics, approxKinetics, T=1000.0): """ Creates a parity plot from the exactKinetics and approxKinetics (dictionarys with - kineticModels are entries). Uses the median temperature of the exactKinetics to - give the best comparison. Returns the parityData in a dictionary with + kineticModels are entries). Uses the user specified temperature T to + compare the two sets of kinetics k(T). Returns the parityData in a dictionary with {key: [exactCoefficient(T), approxCoefficient(T)} """ parityData={} @@ -215,7 +229,7 @@ def countNodes(family): else: # Duplicate node found at top of tree # eg. R_recombination: ['Y_rad', 'Y_rad'] - assert len(forwardTemplate)==2 , 'Can currently only do symmetric trees with nothing else in them' + assert len(forwardTemplate)==2 , 'Duplicate entries in the top nodes must only be in symmetric trees containing exactly 2 total nodes, i.e. [Yrad, Yrad]' forwardTemplate = temporary