Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
EXDF-tools
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dataAnalysis
EXDF-tools
Commits
77ab74ff
Commit
77ab74ff
authored
1 year ago
by
Philipp Schmidt
Browse files
Options
Downloads
Patches
Plain Diff
Parallelize large dataset reads via pasha
parent
6d0fd2bd
No related branches found
No related tags found
1 merge request
!8
Initial performance optimizations for frame selection pilots
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
setup.py
+1
-0
1 addition, 0 deletions
setup.py
src/exdf/write/sd_writer.py
+26
-2
26 additions, 2 deletions
src/exdf/write/sd_writer.py
with
27 additions
and
2 deletions
setup.py
+
1
−
0
View file @
77ab74ff
...
@@ -51,6 +51,7 @@ setup(
...
@@ -51,6 +51,7 @@ setup(
python_requires
=
'
>=3.8
'
,
python_requires
=
'
>=3.8
'
,
install_requires
=
[
install_requires
=
[
'
extra_data>=1.13
'
,
'
extra_data>=1.13
'
,
'
pasha
'
,
# These are pulled in by EXtra-data but listed here for
# These are pulled in by EXtra-data but listed here for
# completeness until they may require pinning.
# completeness until they may require pinning.
...
...
This diff is collapsed.
Click to expand it.
src/exdf/write/sd_writer.py
+
26
−
2
View file @
77ab74ff
...
@@ -18,11 +18,13 @@ from time import perf_counter
...
@@ -18,11 +18,13 @@ from time import perf_counter
import
numpy
as
np
import
numpy
as
np
import
pasha
as
psh
from
extra_data
import
FileAccess
from
extra_data
import
FileAccess
from
.datafile
import
DataFile
,
get_pulse_offsets
from
.datafile
import
DataFile
,
get_pulse_offsets
log
=
getLogger
(
'
exdf.write.SourceDataWriter
'
)
log
=
getLogger
(
'
exdf.write.SourceDataWriter
'
)
psh
.
set_default_context
(
'
processes
'
,
num_workers
=
24
)
class
SourceDataWriter
:
class
SourceDataWriter
:
...
@@ -297,10 +299,11 @@ class SourceDataWriter:
...
@@ -297,10 +299,11 @@ class SourceDataWriter:
for
key
in
iter_index_group_keys
(
keys
,
index_group
):
for
key
in
iter_index_group_keys
(
keys
,
index_group
):
# TODO: Copy by chunk / file if too large
# TODO: Copy by chunk / file if too large
kd
=
sd
[
key
]
start_key
=
perf_counter
()
start_key
=
perf_counter
()
full_data
=
sd
[
key
].
ndarray
(
)
full_data
=
read_keydata
(
kd
)
after_read
=
perf_counter
()
after_read
=
perf_counter
()
masked_data
=
full_data
[
mask
]
masked_data
=
full_data
[
mask
]
...
@@ -308,7 +311,7 @@ class SourceDataWriter:
...
@@ -308,7 +311,7 @@ class SourceDataWriter:
self
.
copy_instrument_data
(
self
.
copy_instrument_data
(
sd
.
source
,
key
,
h5source
.
key
[
key
],
sd
.
source
,
key
,
h5source
.
key
[
key
],
sd
[
key
]
.
train_id_coordinates
()[
mask
],
kd
.
train_id_coordinates
()[
mask
],
masked_data
)
masked_data
)
after_copy
=
perf_counter
()
after_copy
=
perf_counter
()
...
@@ -508,3 +511,24 @@ def mask_index(g, counts, masks_by_train):
...
@@ -508,3 +511,24 @@ def mask_index(g, counts, masks_by_train):
g
[
'
count
'
][:]
=
counts
g
[
'
count
'
][:]
=
counts
return
full_mask
return
full_mask
def read_keydata(kd):
    """Read the full data of a key into memory as a NumPy array.

    Parameters
    ----------
    kd : KeyData-like object
        Must expose ``nbytes``, ``shape``, ``dtype``, ``data_counts()``
        and ``ndarray()`` (extra_data ``KeyData`` — TODO confirm against
        callers).

    Returns
    -------
    numpy.ndarray
        The complete data of this key across all its trains.
    """

    # 1 GiB threshold: below this a plain serial read is cheap enough
    # that fanning out to pasha worker processes is not worth the
    # overhead. (1 << 30 == 1073741824)
    parallel_threshold = 1 << 30

    if kd.nbytes <= parallel_threshold:
        # Simple read for small datasets.
        return kd.ndarray()

    # Use parallelization for GiB-sized datasets. The output array is
    # allocated through pasha so worker processes can write their
    # slices into shared memory directly.
    data = psh.alloc(shape=kd.shape, dtype=kd.dtype)

    # Derive the [start, end) entry range each train occupies in the
    # flattened output from the per-train entry counts: starts are the
    # exclusive prefix sum of the counts.
    counts = kd.data_counts(labelled=False)
    entry_starts = np.zeros_like(counts)
    entry_starts[1:] = np.cumsum(counts[:-1])
    entry_ends = entry_starts + counts

    def read_data(worker_id, index, train_id, entries):
        # pasha callback: copy this train's entries into its slot.
        # worker_id and train_id are unused but required by the
        # psh.map callback signature.
        data[entry_starts[index]:entry_ends[index]] = entries

    psh.map(read_data, kd)
    return data
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment