colabfit · EFuem · Apr 5, 2022 · Apr 5, 2022 · Apr 5, 2022 · Apr 12, 2022
diff --git a/.github/requirements.txt b/.github/requirements.txt
@@ -8,4 +8,5 @@ plotly
 pymongo
 biopython
 pytest
-wheel
+wheel
+django
diff --git a/.gitignore b/.gitignore
@@ -142,5 +142,6 @@ colabfit/api/data/uploads
 .vscode
 
 #pycharm
-.idea
+.idea/
 .iml
+
diff --git a/colabfit/__init__.py b/colabfit/__init__.py
@@ -6,7 +6,7 @@
 HASH_SHIFT = 0
 # HASH_SHIFT = 2**63
 
-ID_FORMAT_STRING= '{}_{:05d}_{:03d}'
+ID_FORMAT_STRING= '{}_{}_{:03d}'
 
 MAX_STRING_LENGTH = 255
 STRING_DTYPE_SPECIFIER = f'S{MAX_STRING_LENGTH}'
@@ -19,6 +19,7 @@
 _PROPSETTINGS_COLLECTION    = 'property_settings'
 _CONFIGSETS_COLLECTION      = 'configuration_sets'
 _DATASETS_COLLECTION        = 'datasets'
+_COUNTERS_COLLECTION        = 'counters'
 
 SHORT_ID_STRING_NAME = 'short-id'
 EXTENDED_ID_STRING_NAME = 'extended-id'

diff --git a/colabfit/examples/ANI-1/ANI-1.ipynb b/colabfit/examples/ANI-1/ANI-1.ipynb
@@ -8,6 +8,268 @@
     "This notebook serves as an example of how to load and manipulate the [ANI-1 dataset](https://github.com/isayev/ANI1_dataset) using a `Dataset` object."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "1953e930",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "import numpy as np\n",
+    "\n",
+    "from colabfit import SHORT_ID_STRING_NAME\n",
+    "\n",
+    "from colabfit.tools.database import MongoDatabase, load_data\n",
+    "from colabfit.tools.property_settings import PropertySettings\n",
+    "from colabfit.tools.configuration import AtomicConfiguration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "4072825e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = MongoDatabase('ani1_rebuild', configuration_type=AtomicConfiguration, nprocs=4, drop_database=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "626c5843",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "my_path_to_pyanitools = '/home/jvita/scripts/colabfit/data/ANI-1_release/'\n",
+    "sys.path.append(my_path_to_pyanitools)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "bf530739",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pyanitools as pya"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e5ee122d",
+   "metadata": {},
+   "source": [
+    "# To do:\n",
+    "\n",
+    "* Merge all of the HDF5 files into a single file so that you can parallelize over it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "bdda8c5e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 20.50it/s]\n",
+      "100%|█████████████████████████████████████████████████████████████████████████| 47932/47932 [01:53<00:00, 420.76it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 765.38it/s]\n",
+      "100%|█████████████████████████████████████████████████████████████████████████████| 267/267 [00:00<00:00, 328.24it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 284.94it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████| 61/61 [00:00<00:00, 249.51it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████| 1406/1406 [00:04<00:00, 321.81it/s]\n",
+      "100%|███████████████████████████████████████████████████████████████████████████| 7760/7760 [00:24<00:00, 317.73it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "import h5py\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "master_file_name = '/home/jvita/scripts/colabfit/data/ANI-1_release/merged.h5'\n",
+    "\n",
+    "counter = 0\n",
+    "\n",
+    "with h5py.File(master_file_name, 'w') as merged:\n",
+    "    for file_path in glob.glob('/home/jvita/scripts/colabfit/data/ANI-1_release/ani_*.h5'):\n",
+    "        with h5py.File(file_path, 'r') as hdf5:\n",
+    "            for group_name in hdf5:\n",
+    "                for sub in tqdm(hdf5[group_name]):\n",
+    "                    merged[sub] = h5py.ExternalLink(file_path, os.path.join(group_name, sub))\n",
+    "                    \n",
+    "                    counter += hdf5[group_name][sub]['coordinates'].shape[0]\n",
+    "\n",
+    "                    if 'coordinatesHE' in hdf5[group_name][sub]:\n",
+    "                        counter += hdf5[group_name][sub]['coordinatesHE'].shape[0]\n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "dd7ff702",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "57462"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "with h5py.File(master_file_name, 'a') as merged:\n",
+    "    group_keys = list(merged.keys())\n",
+    "    \n",
+    "len(group_keys)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "6eec16e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "random.shuffle(group_keys)\n",
+    "split_keys = [_.tolist() for _ in np.split(np.array(group_keys), 6)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "7da2a0fb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 21296.84it/s]\n",
+      "100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 26150.91it/s]\n",
+      "100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 26061.30it/s]\n",
+      "100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 25939.50it/s]\n",
+      "100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 25981.56it/s]\n",
+      "100%|█████████████████████████████████████████████████████████████████████████| 9577/9577 [00:00<00:00, 26087.58it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i, keys in enumerate(split_keys):\n",
+    "    split_file_name = f'/home/jvita/scripts/colabfit/data/ANI-1_release/split_{i}.h5'\n",
+    "    \n",
+    "    with h5py.File(split_file_name, 'w') as hdf5:\n",
+    "        for k in tqdm(keys):\n",
+    "            hdf5[k] = h5py.ExternalLink(master_file_name, k)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "4c1607ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "24687809"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "counter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "d9262187",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def reader(hdf5_path):\n",
+    "    \n",
+    "    with h5py.File(hdf5_path, 'r') as hdf5:\n",
+    "        for key in tqdm(hdf5):\n",
+    "            data = hdf5[key]\n",
+    "\n",
+    "            n_images = data['coordinates'].shape[0]\n",
+    "\n",
+    "            for ni in range(n_images):\n",
+    "                atoms = AtomicConfiguration(\n",
+    "                    symbols=''.join(data['species']),\n",
+    "                    positions=data['coordinates'][ni]\n",
+    "                )\n",
+    "\n",
+    "                atoms.info['_name'] = file_name+data['path']\n",
+    "\n",
+    "                atoms.info['energy'] = data['energies'][ni]\n",
+    "                atoms.info['smiles'] = ''.join(data['smiles'])\n",
+    "\n",
+    "                yield atoms\n",
+    "\n",
+    "            # High-energy structures were separated out\n",
+    "            n_images = data['coordinatesHE'].shape[0]\n",
+    "\n",
+    "            for ni in tqdm(range(n_images)):\n",
+    "                atoms = AtomicConfiguration(\n",
+    "                    symbols=''.join(data['species']),\n",
+    "                    positions=data['coordinatesHE'][ni]\n",
+    "                )\n",
+    "\n",
+    "                atoms.info['_name'] = file_name+data['path']\n",
+    "                atoms.info['_labels'] = ['high_energy']\n",
+    "\n",
+    "                atoms.info['energy'] = data['energiesHE'][ni]\n",
+    "                atoms.info['smiles'] = ''.join(data['smiles'])\n",
+    "\n",
+    "                yield atoms"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cfe3b588",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ids = list(database.insert_data(\n",
+    "    configurations,\n",
+    "    property_map=property_map,\n",
+    "    generator=False,\n",
+    "    transform=tform,\n",
+    "    verbose=True\n",
+    "))\n",
+    "\n",
+    "all_co_ids, all_pr_ids = list(zip(*ids))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72e9a1d5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -847,7 +1109,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.8.11"
   }
  },
  "nbformat": 4,
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,4 +8,5 @@ plotly @@
     pymongo
     biopython
     pytest
-    wheel
+    wheel
+    django