Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
3d978a5
added pycharm to ignore
EFuem Apr 5, 2022
83287da
added pycharm to ignore
EFuem Apr 5, 2022
dfd2922
added pycharm to ignore
EFuem Apr 5, 2022
8f3b17f
Fixed doc typo
EFuem Apr 12, 2022
cb311e6
Added hash for CS/DS and added update functions for each
EFuem Apr 18, 2022
05edc41
added _hash attribute
EFuem May 25, 2022
8375304
hash changes
EFuem Jun 1, 2022
81cf903
merged preliminary version control into development
EFuem Jun 1, 2022
891af83
fixed database merge conflicts
EFuem Jun 1, 2022
74bd8c5
Changed colabfit_id generation
EFuem Jun 2, 2022
8aa540e
trivial changes
EFuem Jun 3, 2022
249c047
Merge branch 'development' of https://github.com/colabfit/colabfit-to…
EFuem Jun 3, 2022
320cbfe
Merge branch 'development' of https://github.com/colabfit/colabfit-to…
EFuem Jun 3, 2022
b1f0220
Merge branch 'development' of https://github.com/colabfit/colabfit-to…
EFuem Jun 3, 2022
146a64c
Remove author character constraints. #23
jvita Jun 6, 2022
db54d99
Adding DS name character constraints
jvita Jun 6, 2022
e632c9e
Changed ID formatting
EFuem Jun 8, 2022
4d18b4b
hash and naming changes
EFuem Jun 8, 2022
be03ada
fixed merge conflicts development/hashing_fixes
EFuem Jun 8, 2022
45c556b
Merged local and remote development
EFuem Jun 8, 2022
a72e429
Upsert using hash
EFuem Jun 8, 2022
ee7d7f6
Unit test changes
EFuem Jun 8, 2022
3eecdc5
Checkpointing work
jvita Jun 9, 2022
54ef825
Merge branch 'testing_hash_changes' of github.com:jvita/colabfit-tool…
jvita Jun 9, 2022
dc161a7
Checkpointing debugging work
jvita Jun 9, 2022
c1b679b
Removed support for Short ID in all but CS and DS
EFuem Jun 21, 2022
1bdc3ec
Debug new IDs
EFuem Jun 22, 2022
996046f
Debugged tests with ID changes
EFuem Jun 22, 2022
1fd09c8
Extended ID changes
EFuem Jun 22, 2022
2bdef2b
Added Django to requirements
EFuem Jun 22, 2022
61dfb45
Fixed stress definition
EFuem Jun 23, 2022
974121d
Changed short-id to include only lowercase alphanumeric
EFuem Jun 23, 2022
7a064dd
Merge branch 'testing_hash_changes' of github.com:jvita/colabfit-tool…
jvita Jun 23, 2022
3c1ae7a
Add globbing to XYZ; allow elements=None
jvita Aug 12, 2022
473e655
Adding default setting keys
jvita Aug 12, 2022
e98016e
Add free energy definition
jvita Aug 12, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ plotly
pymongo
biopython
pytest
wheel
wheel
django
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -142,5 +142,6 @@ colabfit/api/data/uploads
.vscode

#pycharm
.idea
.idea/
*.iml

3 changes: 2 additions & 1 deletion colabfit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
HASH_SHIFT = 0
# HASH_SHIFT = 2**63

ID_FORMAT_STRING= '{}_{:05d}_{:03d}'
ID_FORMAT_STRING= '{}_{}_{:03d}'

MAX_STRING_LENGTH = 255
STRING_DTYPE_SPECIFIER = f'S{MAX_STRING_LENGTH}'
Expand All @@ -19,6 +19,7 @@
_PROPSETTINGS_COLLECTION = 'property_settings'
_CONFIGSETS_COLLECTION = 'configuration_sets'
_DATASETS_COLLECTION = 'datasets'
_COUNTERS_COLLECTION = 'counters'

SHORT_ID_STRING_NAME = 'short-id'
EXTENDED_ID_STRING_NAME = 'extended-id'
Expand Down
264 changes: 263 additions & 1 deletion colabfit/examples/ANI-1/ANI-1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,268 @@
"This notebook serves as an example of how to load and manipulate the [ANI-1 dataset](https://github.com/isayev/ANI1_dataset) using a `Dataset` object."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1953e930",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import numpy as np\n",
"\n",
"from colabfit import SHORT_ID_STRING_NAME\n",
"\n",
"from colabfit.tools.database import MongoDatabase, load_data\n",
"from colabfit.tools.property_settings import PropertySettings\n",
"from colabfit.tools.configuration import AtomicConfiguration"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4072825e",
"metadata": {},
"outputs": [],
"source": [
"client = MongoDatabase('ani1_rebuild', configuration_type=AtomicConfiguration, nprocs=4, drop_database=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "626c5843",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"my_path_to_pyanitools = '/home/jvita/scripts/colabfit/data/ANI-1_release/'\n",
"sys.path.append(my_path_to_pyanitools)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bf530739",
"metadata": {},
"outputs": [],
"source": [
"import pyanitools as pya"
]
},
{
"cell_type": "markdown",
"id": "e5ee122d",
"metadata": {},
"source": [
"# To do:\n",
"\n",
"* Merge all of the HDF5 files into a single file, that way you can parallelize over that"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bdda8c5e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 20.50it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████| 47932/47932 [01:53<00:00, 420.76it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 765.38it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████████| 267/267 [00:00<00:00, 328.24it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 284.94it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████████| 61/61 [00:00<00:00, 249.51it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████| 1406/1406 [00:04<00:00, 321.81it/s]\n",
"100%|███████████████████████████████████████████████████████████████████████████| 7760/7760 [00:24<00:00, 317.73it/s]\n"
]
}
],
"source": [
"import os\n",
"import glob\n",
"import h5py\n",
"from tqdm import tqdm\n",
"\n",
"# Single place to change the location of the ANI-1 release files.\n",
"data_dir = '/home/jvita/scripts/colabfit/data/ANI-1_release'\n",
"master_file_name = os.path.join(data_dir, 'merged.h5')\n",
"\n",
"# Running total of rows found in every 'coordinates' / 'coordinatesHE' dataset.\n",
"counter = 0\n",
"\n",
"with h5py.File(master_file_name, 'w') as merged:\n",
"    # Sort so the processing order does not depend on the filesystem listing order.\n",
"    for file_path in sorted(glob.glob(os.path.join(data_dir, 'ani_*.h5'))):\n",
"        with h5py.File(file_path, 'r') as hdf5:\n",
"            for group_name in hdf5:\n",
"                for sub in tqdm(hdf5[group_name]):\n",
"                    # HDF5 object paths always use '/', independent of the host OS.\n",
"                    merged[sub] = h5py.ExternalLink(file_path, f'{group_name}/{sub}')\n",
"\n",
"                    molecule = hdf5[group_name][sub]\n",
"                    counter += molecule['coordinates'].shape[0]\n",
"\n",
"                    # The high-energy dataset is not present in every group.\n",
"                    if 'coordinatesHE' in molecule:\n",
"                        counter += molecule['coordinatesHE'].shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "dd7ff702",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"57462"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Listing the keys needs read access only; 'r' cannot create or modify the file.\n",
"with h5py.File(master_file_name, 'r') as merged:\n",
"    group_keys = list(merged.keys())\n",
"\n",
"len(group_keys)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6eec16e4",
"metadata": {},
"outputs": [],
"source": [
"# Use a dedicated, seeded RNG so the split files are reproducible across runs\n",
"# without touching the global `random` state.\n",
"random.Random(0).shuffle(group_keys)\n",
"\n",
"# array_split (unlike split) also works when the number of keys is not a\n",
"# multiple of the number of splits; for divisible counts the result is identical.\n",
"split_keys = [_.tolist() for _ in np.array_split(np.array(group_keys), 6)]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7da2a0fb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 21296.84it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 26150.91it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 26061.30it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 25939.50it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 25981.56it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 26087.58it/s]\n"
]
}
],
"source": [
"# Write the split files next to the merged file instead of repeating its directory.\n",
"split_dir = os.path.dirname(master_file_name)\n",
"\n",
"for i, keys in enumerate(split_keys):\n",
"    split_file_name = os.path.join(split_dir, f'split_{i}.h5')\n",
"\n",
"    with h5py.File(split_file_name, 'w') as hdf5:\n",
"        for k in tqdm(keys):\n",
"            # Each entry points back into the merged file rather than copying data.\n",
"            hdf5[k] = h5py.ExternalLink(master_file_name, k)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "4c1607ca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"24687809"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counter"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d9262187",
"metadata": {},
"outputs": [],
"source": [
"def reader(hdf5_path):\n",
"    \"\"\"Yield one AtomicConfiguration per structure stored in an HDF5 file.\n",
"\n",
"    Every top-level key of the file is read as one group. Each row of the\n",
"    group's 'coordinates' dataset becomes a configuration; rows of the optional\n",
"    'coordinatesHE' dataset are yielded afterwards with the 'high_energy' label.\n",
"\n",
"    Args:\n",
"        hdf5_path: path of the HDF5 file to read.\n",
"\n",
"    Yields:\n",
"        AtomicConfiguration objects with '_name', 'energy' and 'smiles' set in\n",
"        `.info` (plus '_labels' for high-energy structures).\n",
"    \"\"\"\n",
"\n",
"    def join_chars(dataset):\n",
"        # h5py may hand string elements back as bytes; decode before joining.\n",
"        return ''.join(\n",
"            c.decode('ascii') if isinstance(c, bytes) else c for c in dataset\n",
"        )\n",
"\n",
"    # Name configurations after the file and group they came from, using only\n",
"    # values that are available inside this function.\n",
"    file_name = os.path.basename(hdf5_path)\n",
"\n",
"    with h5py.File(hdf5_path, 'r') as hdf5:\n",
"        for key in tqdm(hdf5):\n",
"            data = hdf5[key]\n",
"\n",
"            # Identical for every image of this group, so build them once.\n",
"            name = f'{file_name}/{key}'\n",
"            symbols = join_chars(data['species'])\n",
"            smiles = join_chars(data['smiles'])\n",
"\n",
"            # (coordinates key, energies key, labels). High-energy structures\n",
"            # were separated out and are not present in every group.\n",
"            sources = [('coordinates', 'energies', None)]\n",
"            if 'coordinatesHE' in data:\n",
"                sources.append(('coordinatesHE', 'energiesHE', ['high_energy']))\n",
"\n",
"            for coords_key, energy_key, labels in sources:\n",
"                coords = data[coords_key]\n",
"                energies = data[energy_key]\n",
"\n",
"                for ni in range(coords.shape[0]):\n",
"                    atoms = AtomicConfiguration(\n",
"                        symbols=symbols,\n",
"                        positions=coords[ni]\n",
"                    )\n",
"\n",
"                    atoms.info['_name'] = name\n",
"                    if labels is not None:\n",
"                        # Fresh list per configuration so labels are never shared.\n",
"                        atoms.info['_labels'] = list(labels)\n",
"\n",
"                    atoms.info['energy'] = energies[ni]\n",
"                    atoms.info['smiles'] = smiles\n",
"\n",
"                    yield atoms"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cfe3b588",
"metadata": {},
"outputs": [],
"source": [
"ids = list(database.insert_data(\n",
" configurations,\n",
" property_map=property_map,\n",
" generator=False,\n",
" transform=tform,\n",
" verbose=True\n",
"))\n",
"\n",
"all_co_ids, all_pr_ids = list(zip(*ids))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72e9a1d5",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -847,7 +1109,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.8.11"
}
},
"nbformat": 4,
Expand Down
Loading