Skip to content

Commit fdbbf4f

Browse files
ax3lf-schmitt
authored and committed
HDF5: Empiric for Optimal Chunk Size
This ports a prior empirical algorithm from libSplash to determine an optimal (large) chunk size for an HDF5 dataset based on its datatype and global extent. Original implementation by Felix Schmitt (ZIH, TU Dresden) in libSplash. Original source: - https://github.com/ComputationalRadiationPhysics/libSplash/blob/v1.7.0/src/DCDataSet.cpp - https://github.com/ComputationalRadiationPhysics/libSplash/blob/v1.7.0/src/include/splash/core/DCHelper.hpp Co-authored-by: Felix Schmitt <felix.schmitt@zih.tu-dresden.de>
1 parent 6d4977a commit fdbbf4f

2 files changed

Lines changed: 125 additions & 10 deletions

File tree

include/openPMD/IO/HDF5/HDF5Auxiliary.hpp

Lines changed: 106 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright 2017-2020 Fabian Koller
1+
/* Copyright 2017-2020 Fabian Koller, Felix Schmitt, Axel Huebl
22
*
33
* This file is part of openPMD-api.
44
*
@@ -27,7 +27,10 @@
2727

2828
#include <hdf5.h>
2929

30+
#include <array>
31+
#include <cstdint>
3032
#include <complex>
33+
#include <map>
3134
#include <stack>
3235
#include <string>
3336
#include <typeinfo>
@@ -304,4 +307,105 @@ concrete_h5_file_position(Writable* w)
304307

305308
return auxiliary::replace_all(pos, "//", "/");
306309
}
307-
} // openPMD
310+
311+
/** Computes the chunk dimensions for a dataset.
312+
*
313+
* Chunk dimensions are selected to create chunks sizes between
314+
* 64KByte and 4MB. Smaller chunk sizes are inefficient due to overhead,
315+
* larger chunks do not map well to file system blocks and striding.
316+
*
317+
* Chunk dimensions are less or equal to dataset dimensions and do
318+
* not need to be a factor of the respective dataset dimension.
319+
*
320+
* @param[in] dims dimensions of dataset to get chunk dims for
321+
* @param[in] typeSize size of each element in bytes
322+
* @return array for resulting chunk dimensions
323+
*/
324+
inline std::vector< hsize_t >
325+
getOptimalChunkDims( std::vector< hsize_t > const dims,
326+
size_t const typeSize )
327+
{
328+
auto const ndims = dims.size();
329+
std::vector< hsize_t > chunk_dims( dims.size() );
330+
331+
// chunk sizes in KiByte
332+
constexpr std::array< size_t, 7u > CHUNK_SIZES_KiB
333+
{{4096u, 2048u, 1024u, 512u, 256u, 128u, 64u}};
334+
335+
size_t total_data_size = typeSize;
336+
size_t max_chunk_size = typeSize;
337+
size_t target_chunk_size = 0u;
338+
339+
// compute the order of dimensions (descending)
340+
// large dataset dimensions should have larger chunk sizes
341+
std::multimap<hsize_t, uint32_t> dims_order;
342+
for (uint32_t i = 0; i < ndims; ++i)
343+
dims_order.insert(std::make_pair(dims[i], i));
344+
345+
for (uint32_t i = 0; i < ndims; ++i)
346+
{
347+
// initial number of chunks per dimension
348+
chunk_dims[i] = 1;
349+
350+
// try to make at least two chunks for each dimension
351+
size_t half_dim = dims[i] / 2;
352+
353+
// compute sizes
354+
max_chunk_size *= (half_dim > 0) ? half_dim : 1;
355+
total_data_size *= dims[i];
356+
}
357+
358+
// compute the target chunk size
359+
for( auto const & chunk_size : CHUNK_SIZES_KiB )
360+
{
361+
target_chunk_size = chunk_size * 1024;
362+
if (target_chunk_size <= max_chunk_size)
363+
break;
364+
}
365+
366+
size_t current_chunk_size = typeSize;
367+
size_t last_chunk_diff = target_chunk_size;
368+
std::multimap<hsize_t, uint32_t>::const_iterator current_index =
369+
dims_order.begin();
370+
371+
while (current_chunk_size < target_chunk_size)
372+
{
373+
// test if increasing chunk size optimizes towards target chunk size
374+
size_t chunk_diff = target_chunk_size - (current_chunk_size * 2u);
375+
if (chunk_diff >= last_chunk_diff)
376+
break;
377+
378+
// find next dimension to increase chunk size for
379+
int can_increase_dim = 0;
380+
for (uint32_t d = 0; d < ndims; ++d)
381+
{
382+
int current_dim = current_index->second;
383+
384+
// increasing chunk size possible
385+
if (chunk_dims[current_dim] * 2 <= dims[current_dim])
386+
{
387+
chunk_dims[current_dim] *= 2;
388+
current_chunk_size *= 2;
389+
can_increase_dim = 1;
390+
}
391+
392+
current_index++;
393+
if (current_index == dims_order.end())
394+
current_index = dims_order.begin();
395+
396+
if (can_increase_dim)
397+
break;
398+
}
399+
400+
// can not increase chunk size in any dimension
401+
// we must use the current chunk sizes
402+
if (!can_increase_dim)
403+
break;
404+
405+
last_chunk_diff = chunk_diff;
406+
}
407+
408+
return chunk_dims;
409+
}
410+
411+
} // namespace openPMD

src/IO/HDF5/HDF5IOHandler.cpp

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "openPMD/IO/HDF5/HDF5IOHandlerImpl.hpp"
2323

2424
#if openPMD_HAVE_HDF5
25+
# include "openPMD/Datatype.hpp"
2526
# include "openPMD/auxiliary/Filesystem.hpp"
2627
# include "openPMD/auxiliary/StringManip.hpp"
2728
# include "openPMD/backend/Attribute.hpp"
@@ -255,21 +256,30 @@ HDF5IOHandlerImpl::createDataset(Writable* writable,
255256
Attribute a(0);
256257
a.dtype = d;
257258
std::vector< hsize_t > dims;
258-
for( auto const& val : parameters.extent )
259+
std::uint64_t num_elements = 1u;
260+
for( auto const& val : parameters.extent ) {
259261
dims.push_back(static_cast< hsize_t >(val));
262+
num_elements *= val;
263+
}
260264

261265
hid_t space = H5Screate_simple(static_cast< int >(dims.size()), dims.data(), dims.data());
262266
VERIFY(space >= 0, "[HDF5] Internal error: Failed to create dataspace during dataset creation");
263267

264-
std::vector< hsize_t > chunkDims;
265-
for( auto const& val : parameters.chunkSize )
266-
chunkDims.push_back(static_cast< hsize_t >(val));
267-
268268
/* enable chunking on the created dataspace */
269269
hid_t datasetCreationProperty = H5Pcreate(H5P_DATASET_CREATE);
270-
herr_t status;
271-
//status = H5Pset_chunk(datasetCreationProperty, chunkDims.size(), chunkDims.data());
272-
//VERIFY(status == 0, "[HDF5] Internal error: Failed to set chunk size during dataset creation");
270+
271+
if( num_elements != 0u )
272+
{
273+
// get chunking dimensions
274+
std::vector< hsize_t > chunk_dims = getOptimalChunkDims(dims, toBytes(d));
275+
276+
// TODO: allow overwrite with user-provided chunk size
277+
//for( auto const& val : parameters.chunkSize )
278+
// chunk_dims.push_back(static_cast< hsize_t >(val));
279+
280+
herr_t status = H5Pset_chunk(datasetCreationProperty, chunk_dims.size(), chunk_dims.data());
281+
VERIFY(status == 0, "[HDF5] Internal error: Failed to set chunk size during dataset creation");
282+
}
273283

274284
std::string const& compression = parameters.compression;
275285
if( !compression.empty() )
@@ -317,6 +327,7 @@ HDF5IOHandlerImpl::createDataset(Writable* writable,
317327
H5P_DEFAULT);
318328
VERIFY(group_id >= 0, "[HDF5] Internal error: Failed to create HDF5 group during dataset creation");
319329

330+
herr_t status;
320331
status = H5Dclose(group_id);
321332
VERIFY(status == 0, "[HDF5] Internal error: Failed to close HDF5 dataset during dataset creation");
322333
status = H5Tclose(datatype);

0 commit comments

Comments
 (0)