datalad-course/html/nhr_2025_datalad.html
Adina Wagner 17bcefc9bc fix link
2025-04-13 15:38:14 +02:00

1133 lines
48 KiB
HTML

<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
<!-- Edit me start! -->
<title>DataLad @ NHR </title>
<meta name="description" content="Decentralized Management of Digital Objects for Open Science">
<meta name="author" content="Adina Wagner">
<!-- Edit me end! -->
<link rel="stylesheet" href="../reveal.js/dist/reset.css">
<link rel="stylesheet" href="../reveal.js/dist/reveal.css">
<link rel="stylesheet" href="../reveal.js/dist/theme/beige.css">
<link rel="stylesheet" href="../css/main.css">
<!-- Theme used for syntax highlighted code -->
<link rel="stylesheet" href="../reveal.js/plugin/highlight/monokai.css">
</head>
<body>
<div class="reveal">
<div class="slides">
<section>
<section>
<h2>DataLad</h2>
<h3>Decentralized Management of Digital Objects for Open Science</h3>
<div style="margin-top:1em;text-align:center">
<table style="border: none;">
<tr>
<td style="border: none;">Dr. Adina Wagner
<br><small>
<a href="https://mas.to/@adswa" target="_blank">
<img data-src="../pics/mastodon.svg" style="height:30px;margin:0px" />
mas.to/@adswa</a></small></td>
<td style="border: none;">
<br></td>
</tr>
<tr>
<td style="border: none; vertical-align:top">
<small>
<br> Institute of Neuroscience and
Medicine, Brain &amp; Behavior (INM-7)<br>
Research Center Jülich</small><br>
</td>
<td><img style="height:100px;margin-right:10px" data-src="../pics/fzj_logo.png" /></td>
</tr>
</table>
</div>
<!-- <p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:0px;margin-bottom:100px;margin-left:1000px">
<img src="../pics/qr_nhr.png" height="200">
</p>-->
<br><br><small>
Slides: <a href="https://doi.org/10.5281/zenodo.15193934" target="_blank">
DOI 10.5281/zenodo.15193934</a> (Scan the QR code) <br>
<a href="https://files.inm7.de/adina/talks/html/nhr_2025_datalad.html" target="_blank">
files.inm7.de/adina/talks/html/nhr_2025_datalad.html</a></small>
</section>
<section>
<h2>Acknowledgements</h2>
<table>
<tr style="vertical-align:middle">
<td style="vertical-align:middle">
<dl>
<dt style="margin-top:20px">DataLad software <br>
& ecosystem</dt>
<dd style="margin-left:5px!important">
<ul style="margin-left:5px!important">
<li>Psychoinformatics Lab, <br>
Research Centre Jülich</li>
<li>Center for Open <br>
Neuroscience, <br>
Dartmouth College</li>
<li>Joey Hess (git-annex)</li>
<li><em>>100 additional contributors</em></li>
</ul>
</dd>
</dl>
</td>
<td style="vertical-align:middle">
<div style="margin-bottom:-20px;text-align:center"><strong>Funders</strong></div>
<img style="height:150px;margin-right:50px" data-src="../pics/nsf.png" />
<img style="height:150px;margin-right:50px;margin-left:50px" data-src="../pics/binc.png" />
<img style="height:150px;margin-left:50px" data-src="../pics/bmbf.png" />
<div style="margin-top:-20px">
<img style="height:80px;margin-top:-40px;margin-left:40px" data-src="../pics/fzj_logo.svg" />
<img style="height:60px;margin-left:50px;margin-bottom:25px" data-src="../pics/dfg_logo.png" />
</div>
<div style="margin-top:-20px">
<img style="height:60px;margin-right:20px" data-src="../pics/erdf.png" />
<img style="height:60px;margin-right:20px" data-src="../pics/cbbs_logo.png" />
<img style="height:60px" data-src="../pics/LSA-Logo.png" />
</div>
<div style="margin-top:40px;margin-bottom:20px;text-align:center"><strong>Collaborators</strong></div>
<div style="margin-top:-20px">
<img style="height:100px;margin:20px" data-src="../pics/hbp_logo.png" />
<img style="height:100px;margin:20px" data-src="../pics/conp_logo.png" />
<img style="height:120px;margin:10px" data-src="../pics/openneuro_logo.png" />
</div>
<div style="margin-top:-40px">
<img style="height:100px;margin:20px" data-src="../pics/ebrains-logo.png"/>
<img style="height:100px;margin:0px" data-src="../pics/gin-logo.png" />
<img style="height:120px;margin:10px" data-src="../pics/sfb1451_logo.png" />
</div>
<div style="margin-top:-40px;align:middle">
<img style="height:140px;margin:10px" data-src="../pics/brainlife_logo.png" />
<img style="height:100px;margin:0px" data-src="../pics/cbrain_logo.png" />
<img style="height:100px;margin:20px" data-src="../pics/vbc_logo.png" />
</div>
</td>
</tr>
</table>
</section>
</section>
<!-- Data Management and DataLad -->
<section>
<!-- Show of hands who has seen this image.
What is it that people hint at when they show this image? (Git)
-->
<section data-transition="None">
<h3 class="fragment fade-in" data-fragment-index="1">The building blocks of a scientific result are rarely static</h3>
<table>
<tr>
<div class="r-stack">
<p class="fragment fade-in-then-out" data-fragment-index="2">Mar 2019</p>
<p class="fragment fade-in-then-out" data-fragment-index="3">Spring 2019</p>
<p class="fragment fade-in-then-out" data-fragment-index="4">July 2019</p>
<p class="fragment fade-in-then-out" data-fragment-index="5">Dec 2019</p>
<!-- <p class="fragment fade-in-then-out" data-fragment-index="6">Mar 2025 <br>
<small> <a href="https://www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification/" target="_blank">
www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification</a> </small></p>-->
</div>
</tr>
<tr>
<div class="r-stack">
<img class="fragment fade-out" data-fragment-index="2" src="../pics/phd052810s.png" height="700">
<img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/abcd_data_issues1.png">
<img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/abcd_data_issues2.png">
<img class="fragment fade-in-then-out" data-fragment-index="4" src="../pics/abcd_data_issues3.png">
<img class="fragment fade-in-then-out" data-fragment-index="5" src="../pics/abcd_data_issues4.png">
<img class="fragment fade-in-then-out" data-fragment-index="6" src="../pics/frontend_vs_backend_paper.png">
<!-- <img class="fragment fade-in-then-out" data-fragment-index="6" src="../pics/nda_review.png"> -->
</div>
<imgcredit class="fragment fade-out" data-fragment-index="1">Piled Higher and Deeper
<a href="https://phdcomics.com/comics/archive_print.php?comicid=1323" target="_blank">
1323
</a> </imgcredit>
</tr>
</table>
<div class="r-stack">
<p style="vertical-align:middle" class="fragment fade-in-then-out" data-fragment-index="1"><u>Data</u> changes <br>
<small>(errors are fixed, data is extended,<br>
naming standards change, an analysis <br>
requires only a subset of your data...)</small></p>
<p style="vertical-align:middle" class="fragment fade-in" data-fragment-index="2">
<small>source: <a href="https://abcdstudy.org/scientists/data-sharing-archive" target="_blank">abcdstudy.org/scientists/data-sharing-archive</a> </small></p>
</div>
</section>
</section>
<section>
<section>
<img style="height:300px; margin-top: 0; margin-right:1px;vertical-align:middle;" src="../pics/datalad_logo_wide.svg" alt="">
<br>
<ul style="font-size:37px">
<li>Domain-agnostic <strong>command-line tool</strong>
(+ <strong>graphical user interface</strong>),
built on top of <a href="https://git-scm.com/" target="_blank">Git</a>
& <a href="https://git-annex.branchable.com/" target="_blank">Git-annex</a></li>
<li>Open source (MIT) research software developed since 2013</li>
<li>Available for all major operating systems</li>
<li>Major features:</li>
<dt>Version-controlling arbitrarily large content </dt>
<dd>Version control data & software alongside code!</dd>
<dt>Transport mechanisms for sharing & obtaining data </dt>
<dd>Consume & collaborate on data (analyses) like software</dd>
<dt>(Computationally) reproducible data analysis</dt>
<dd>Track and share provenance of all digital objects</dd>
<dt>(... and <i>much</i> more) </dt>
<br>
</ul>
</section>
<section data-transition="None">
<img src="../pics/vamp_0_start.png"><br><br>
A DataLad dataset is a joint Git/git-annex repository that can version control any file
<br><br>
<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# turn any directory into a dataset
# with version control
% datalad create &lt;directory&gt;
</pre></code>
</td><td style="padding:0px">
<code><pre>
# save a new state of a dataset with
# file content of any size
% datalad save
</pre></code>
</td></tr></table>
</section>
<section data-transition="None">
<img src="../pics/vamp_1_provcapture.png">
<br><br>
Which data (at which version), with which code, running with what parameterization in which
computational environment, to generate what?<br><br>
<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# execute any command and capture its output
# while recording all input versions too
% datalad run --input ... --output ... &lt;command&gt;
</pre></code>
</td></tr></table>
</section>
<section data-transition="None">
<img src="../pics/vamp_2_pushtocloud.png">
<br><br>
Decentralized data transport to Git hosting, local or remote infrastructure, or external hosting services
<br><br>
<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# transfer data and metadata to other sites and services
# with fine-grained access control for dataset components
% datalad push --to &lt;site-or-service&gt;
</pre></code>
</td></tr></table>
</section>
<section data-transition="None">
<img src="../pics/vamp_3_reproduce.png">
<br><br>
Outcomes can be validated. This enables audits, promotes accountability, and streamlines automated "upgrades" of outputs
<br><br>
<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# obtain dataset (initially only identity,
# availability, and provenance metadata)
% datalad clone &lt;url&gt;
</pre></code>
</td><td style="padding:0px">
<code><pre>
# immediately actionable provenance records
# full abstraction of input data retrieval
% datalad rerun &lt;commit|tag|range&gt;
</pre></code>
</td></tr></table>
</section>
<section data-transition="None">
<img src="../pics/vamp_4_reuse.png">
<br>Datasets can be (re-)used as modular components in larger contexts — propagating
their traits. They are verifiable, portable, self-contained data structures
<br><br>
<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# declare a dependency on another dataset and
# re-use it at a particular state in a new context
% datalad clone -d &lt;superdataset&gt; &lt;url&gt; &lt;path-in-dataset&gt;
</pre></code>
</td></tr></table>
</section>
<section>
<h2>Version control beyond text files</h2>
<p class="fragment fade-in" data-fragment-index="2">
<img class="fragment fade-in" data-fragment-index="2" src="../pics/gitannex.png" height="100px">
Using <a href="https://git-annex.branchable.com" target="_blank">git-annex</a>,
<a href="https://datalad.org" target="_blank">DataLad</a> version controls large data
<img class="fragment fade-in" data-fragment-index="2" src="../pics/datalad_logo_wide.svg" height="100px"></p>
<div class="r-stack">
<img class="fragment fade-in" height="500" data-fragment-index="3" src="../pics/tigdata.png">
<img class="fragment fade-in" height="500" data-fragment-index="4" src="../pics/tigdata3.png">
<img class="fragment fade-in" height="500" data-fragment-index="5" src="../pics/tigdata2.png">
</div>
</section>
<section data-transition="None" style="font-size:35px">
<h2>Version control beyond text files</h2>
<ul>
<li>Datasets have an <b>annex</b> to track files without
placing their content into Git</li>
<li>Rather than content, <strong>identity</strong> (hash) and <strong>location</strong> information is put into Git:</li>
<ul>
<li class="fragment fade-in" data-fragment-index="0">Where the filesystem allows it, annexed files are symlinks:</li>
</ul>
</ul>
<pre class="fragment fade-in" data-fragment-index="0"><code class="fragment fade-in language-bash" style="max-width:none" data-fragment-index="0">$ ls -l sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
lrwxrwxrwx 1 adina adina 142 Jul 22 19:45 sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz ->
../../.git/annex/objects/kZ/K5/MD5E-s24180157--aeb0e5f2e2d5fe4ade97117a8cc5232f.nii.gz/MD5E-s24180157
--aeb0e5f2e2d5fe4ade97117a8cc5232f.nii.gz
</code></pre><small class="fragment fade-in" data-fragment-index="0">(PS: especially useful in datasets with many identical files) </small>
<ul><ul>
<li class="fragment fade-in" data-fragment-index="1">The symlink reveals the internal data organization, which is based on the identity hash</li>
</ul>
</ul>
<pre class="fragment fade-in" data-fragment-index="1"><code class="fragment fade-in language-bash" data-fragment-index="1">$ md5sum sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
aeb0e5f2e2d5fe4ade97117a8cc5232f sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
</code></pre>
<ul><ul>
<li class="fragment fade-in" data-fragment-index="2">The (tiny) symlink instead of the (potentially large) file content is
committed - version controlling precise file identity without checking contents into Git
<img src="../pics/annex-commit.png"></li>
<li class="fragment fade-in" data-fragment-index="3">File availability information is stored to
record a decentral network of file content.
A file can exist in multiple different locations.</li>
</ul></ul>
<pre class="fragment fade-in" data-fragment-index="3"><code class="fragment fade-in language-bash" data-fragment-index="3">$ git annex whereis sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
whereis sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz (2 copies)
8c3680dd-6165-4749-adaa-c742232bc317 -- git@8242caf9acd8:/data/repos/adswa/bidsdata.git [gin]
fff8fdbc-3185-4b78-bd12-718717588442 -- adina@muninn:~/bids-data [here]
ok
</code></pre>
</section>
<section>
<h2>Git versus Git-annex</h2>
<dl>
<dt>Data in datasets is either stored in Git or git-annex</dt>
<dd>By default, everything is <i>annexed</i>, i.e., stored in a dataset annex</dd>
</dl>
<img height="400" src="../pics/artwork/src/publishing/publishing_gitvsannex.svg">
<small>
<table>
<tr>
<td><b>Git</b></td>
<td><b>git-annex</b></td>
</tr>
<tr>
<td>handles <b>small</b> files well (text, code)</td>
<td>handles <b>all</b> types and sizes of files well</td>
</tr>
<tr>
<td>file contents are in the Git history
and will be <b>shared</b> upon git/datalad push</td>
<td>file contents are in the annex. Not necessarily shared</td>
</tr>
<tr>
<td>Shared with every dataset clone</td>
<td><b>Can be kept private</b> on a per-file level when sharing the dataset</td>
</tr>
<tr>
<td>Useful: Small, non-binary, frequently modified, need-to-be-accessible (DUA, README) files </td>
<td>Useful: Large files, private files</td>
</tr>
</table>
</small>
</section>
</section>
<section>
<section>
<h2>(Raw) data mismanagement</h2>
<ul>
<li>Multiple large datasets are available on a compute cluster 🏞 </li>
<li>Each researcher creates their own copies of data ⛰ </li>
<li>Multiple different derivatives and results are computed from it 🏔</li>
<li>Data, copies of data, half-baked data transformations, results, and
old versions of results are kept - undocumented 🌋 </li>
</ul>
</section>
<section data-transition="None">
<h2>Share data like source code</h2>
<div class="r-stack">
<img class="fragment fade-in-then-out" data-fragment-index="0" src="../pics/centralmanagement2.gif" alt="a screenrecording of cloning an institutional superdataset from GitLab">
<img class="fragment fade-in" data-fragment-index="1" style="box-shadow: 5px 5px 3px #888888" height="330" src="../pics/artwork/src/collaboration.svg">
</div>
<aside class="notes">
Idea behind datalad: Enable a similar level of tooling and culture for the distribution and version control of data as it is present for open source software development
</aside>
</section>
<section>
<h3>Transport logistics: Lots of data, little disk-usage</h3>
<ul>
<li class="fragment fade-in">
Cloned datasets are lean.
"Meta data" (file names, availability) are present, but <b>no file content</b>:</li>
<pre class="fragment fade-in"><code data-trim class="language-bash" onmousemove="showHover(event)" onmousedown="clickCopy(event)" onmouseleave="leaveElement(event)">$ datalad clone git@github.com:psychoinformatics-de/studyforrest-data-phase2.git
install(ok): /tmp/studyforrest-data-phase2 (dataset)
$ cd studyforrest-data-phase2 && du -sh
18M .</code></pre>
<li class="fragment fade-in">
files' contents can be retrieved on demand:
</li>
</ul>
<pre class="fragment fade-in"><code data-trim class="language-bash" onmousemove="showHover(event)" onmousedown="clickCopy(event)" onmouseleave="leaveElement(event)">$ datalad get sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
get(ok): /tmp/studyforrest-data-phase2/sub-01/ses-movie/func/
sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file) [from mddatasrc...]</code></pre>
<ul>
<li class="fragment fade-in">Have access to more data on your computer than you have disk-space:</li>
<pre class="fragment fade-in"><code># eNKI dataset (1.5TB, 34k files):
$ du -sh
1.5G .
# HCP dataset (~200TB, >15 million files)
$ du -sh
48G . </code></pre>
</ul>
</section>
<section data-markdown data-transition="None"> <script type="text/template">
## Plenty of data, but little disk-usage
Drop file content that is not needed:<!-- .element: class="fragment fade-in" -->
<pre class="fragment fade-in"><code data-trim class="language-bash">$ datalad drop sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
drop(ok): /[...]/sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file)
</code></pre>
Only "meta data" stays behind, and files can be re-obtained on demand. This allows for disk-space-aware computing workflows:<!-- .element: class="fragment fade-in" -->
<pre><code class="python">dl.get('input/sub-01')
[really complex analysis]
dl.drop('input/sub-01')
</code></pre><!-- .element: class="fragment fade-in" -->
</script></section>
<section data-transition="None">
<h2>Publishing datasets</h2>
<ul>
Publish datasets, their annexed contents, or both to infrastructure of your choice
</ul>
<div class="r-stack">
<img data-fragment-index="0" height="600" src="../pics/artwork/src/publishing/publishing_network_gitvsannex.svg">
</div>
</section>
<section data-transition="None">
<h2>Interoperability</h2>
<ul>
<li>DataLad is built to maximize interoperability and streamline routines across hosting and
storage technology</li>
</ul>
<img src="../pics/services_connected.png" height="650">
</section>
</section>
<section>
<!-- on modularity -->
<section data-markdown><script type="text/template">
## Modularity
![](../pics/submodule_setup.svg)<!-- .element: height="500" -->
- Typical workflow in science
- Prior works (algorithm development, empirical data, etc.) are combined
to produce novel results with the goal of a publication
- **Aggregation across time and contributors**
- Aiming (but often failing) to be reproducible
</script>
</section>
<section data-markdown><script type="text/template">
## Version control beyond single repositories
- **Why** are multiple repositories needed (in science)?
- Size impacts I/O and logistics
- Git can struggle with 1M+ files or 100k+ commits
- Filesystems (licensing) can struggle with large numbers of inodes
- Target audience is different
- Public vs. private or personal vs. anonymized data
- Pace of evolution or access patterns are different
- "Factual" raw data vs. choices of (pre-)processing
- Completed acquisition vs. ongoing study
![](../pics/dataflow.jpg)<!-- .element: height="200" -->
- A **single repository is not enough**, but Git/Git-annex are not optimized
for such use cases
</script>
</section>
<section data-transition="None">
<h2>Dataset Nesting</h2>
<ul>
<li>Seamless nesting mechanisms:
<img height="330" src="../pics/artwork/src/linkage_subds.svg">
<ul>
<li>hierarchies of datasets in super-/sub-dataset relationships</li>
<li>based on Git submodules, but more seamless: Mono-repo feel thanks to recursive operations</li>
</ul>
<li class="fragment fade-in" data-fragment-index="2">Overcomes scaling issues with large amounts of files</li>
<pre class="fragment fade-in" data-fragment-index="2"><code>adina@bulk1 in /ds/hcp/super on git:master❱ datalad status --annex -r
15530572 annex'd files (77.9 TB recorded total size)
nothing to save, working tree clean</code></pre>
<small><a class="fragment fade-in" data-fragment-index="2" href="https://github.com/datalad-datasets/human-connectome-project-openaccess" target="_blank">(github.com/datalad-datasets/human-connectome-project-openaccess)</a></small>
<li class="fragment fade-in">Modularizes research components for transparency, reuse, and access management</li>
</ul>
</section>
<section data-transition="None">
<h2>Intuitive data analysis structure</h2>
<li>You can link datasets together in superdataset-subdataset hierarchies:</li>
<img src="../pics/artwork/src/linkage_subds.svg" width="900"> <br>
<pre><code style="max-width:none" class="bash" data-line-numbers="1,3, 6">$ cd myanalysis
# we can install analysis input data as a subdataset to the dataset
$ datalad clone -d . https://github.com/datalad-handbook/iris_data.git input/
[INFO ] Scanning for unlocked files (this may take some time)
[INFO ] Remote origin not usable by git-annex; setting annex-ignore
install(ok): input (dataset)
add(ok): input (file)
add(ok): .gitmodules (file)
save(ok): . (dataset)
action summary:
add (ok: 2)
install (ok: 1)
save (ok: 1)
</code></pre>
</section>
</section>
<section>
<section data-transition="None">
<h2>Leaving a trace </h2>
<p>"Shit, which version of which script produced these outputs from which version
of what data?"</p>
<p>
"Shit, which buttons did I click and in which order did I use all those tools?"</p>
<br>
<p>
<img src="../pics/manuallabor.png">
<img src="../pics/findfiles.png" height="400">
<img src="../pics/projectstack.png" height="350">
<imgcredit>CC-BY Scriberia and <a href="https://the-turing-way.netlify.app/reproducible-research/rdm.html" target="_blank">
The Turing Way</a>
</imgcredit>
</p>
</section>
<section data-transition="None">
<h2>Leaving a trace</h2>
<p class="fragment" data-fragment-index="1"> <strong>datalad run</strong> wraps around anything expressed in a command
line call and saves the dataset modifications resulting from the execution.</p>
<p class="fragment" data-fragment-index="2"> <strong>datalad rerun</strong> repeats captured executions.
If the outcomes
differ, it saves a new state of them.</p>
<p class="fragment" data-fragment-index="3"> <strong>datalad containers-run</strong> executes command
line calls inside a tracked software container and saves the dataset modifications resulting from the execution.</p>
<div class="r-stack">
<img class="fragment fade-in-then-out" data-fragment-index="1" src="../pics/run_basic.svg" height="350">
<img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/rerun.svg" height="350">
<img class="fragment fade-in" data-fragment-index="3" src="../pics/containers-run_basic.svg" height="350">
</div>
</section>
<section data-transition="None">
<h2>data analysis provenance</h2>
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px">
Enshrine the analysis in a script
</p>
<p class="fragment fade-in" style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:180px;margin-bottom:-60px">
Here: extract_lc_timeseries.py
</p>
<p style="z-index: -1">
<pre><code class="bash" style="max-height:none" data-line-numbers="6">$ datalad containers-run \
--message "Time series extraction from Locus Coeruleus" \
--container-name nilearn \
--input 'mri/*_bold.nii' \
--output 'sub-*/LC_timeseries_run-*.csv' \
"python3 code/extract_lc_timeseries.py"
-- Git commit --
commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
Author: Adina Wagner adina.wagner@t-online.de
AuthorDate: Mon Nov 11 16:15:08 2019 +0100
Commit: Adina Wagner adina.wagner@t-online.de
CommitDate: Mon Nov 11 16:15:08 2019 +0100
[DATALAD RUNCMD] Time series extraction from Locus Coeruleus
=== Do not change lines below ===
{
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
"inputs": [
"mri/*.bold.nii.gz",
".datalad/environments/nilearn.simg"
],
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
...
}
^^^ Do not change lines above ^^^
---
sub-01/LC_timeseries_run-1.csv | 1 +
...
</code></pre>
</p>
</section>
<section data-transition="None">
<h2>data analysis provenance</h2>
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:130px;margin-bottom:-60px;margin-left:750px">
Record code execution together <br> with
input-data, output files and software
environment in the
execution-command
</p>
<p style="z-index: -1">
<pre><code class="bash" style="max-height:none" data-line-numbers="1-6">$ datalad containers-run \
--message "Time series extraction from Locus Coeruleus" \
--container-name nilearn \
--input 'mri/*_bold.nii' \
--output 'sub-*/LC_timeseries_run-*.csv' \
"python3 code/extract_lc_timeseries.py"
-- Git commit --
commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
Author: Adina Wagner adina.wagner@t-online.de
AuthorDate: Mon Nov 11 16:15:08 2019 +0100
Commit: Adina Wagner adina.wagner@t-online.de
CommitDate: Mon Nov 11 16:15:08 2019 +0100
[DATALAD RUNCMD] Time series extraction from Locus Coeruleus
=== Do not change lines below ===
{
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
"inputs": [
"mri/*.bold.nii.gz",
".datalad/environments/nilearn.simg"
],
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
...
}
^^^ Do not change lines above ^^^
---
sub-01/LC_timeseries_run-1.csv | 1 +
...
</code></pre>
</p>
</section>
<section data-transition="None">
<h2>data analysis provenance</h2>
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px;margin-left:200px">
Result: machine readable record about which data, code, and <br>
software produced a result how, when, and why.
</p>
<p style="z-index: -1">
<pre><code class="bash" style="max-height:none" data-line-numbers="8-30">$ datalad containers-run \
--message "Time series extraction from Locus Coeruleus" \
--container-name nilearn \
--input 'mri/*_bold.nii' \
--output 'sub-*/LC_timeseries_run-*.csv' \
"python3 code/extract_lc_timeseries.py"
-- Git commit --
commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
Author: Adina Wagner adina.wagner@t-online.de
AuthorDate: Mon Nov 11 16:15:08 2019 +0100
Commit: Adina Wagner adina.wagner@t-online.de
CommitDate: Mon Nov 11 16:15:08 2019 +0100
[DATALAD RUNCMD] Time series extraction from Locus Coeruleus
=== Do not change lines below ===
{
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
"inputs": [
"mri/*.bold.nii.gz",
".datalad/environments/nilearn.simg"
],
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
...
}
^^^ Do not change lines above ^^^
---
sub-01/LC_timeseries_run-1.csv | 1 +
...
</code></pre>
</p>
</section>
<section data-transition="None">
<h2>data analysis provenance</h2>
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px;margin-left:350px">
Use the unique identifier of the execution record
</p>
<p style="z-index: -1">
<pre><code class="bash" style="max-height:none" data-line-numbers="1">$ datalad rerun 5a7565a640ff6de67
[INFO ] run commit 5a7565a640ff6de67; (Time series extraction from Locus Coeruleus)
[INFO ] Making sure inputs are available (this may take some time)
get(ok): mri/sub-01_bold.nii (file)
get(ok): mri/sub-02_bold.nii (file)
[...]
[INFO ] == Command start (output follows) =====
[INFO ] == Command exit (modification check follows) =====
add(ok): sub-01/LC_timeseries_run-*.csv (file)
add(ok): sub-02/LC_timeseries_run-*.csv (file)
[...]
action summary:
add (ok: 30)
get (ok: 30)
save (ok: 2)
unlock (ok: 30)
</code></pre>
</p>
</section>
<section data-transition="None">
<h2>data analysis provenance</h2>
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:400px;margin-bottom:-60px;margin-left:350px">
... to have a machine recompute and verify past work
</p>
<p style="z-index: -1">
<pre><code class="bash" style="max-height:none" data-line-numbers="2-16">$ datalad rerun 5a7565a640ff6de67
[INFO ] run commit 5a7565a640ff6de67; (Time series extraction from Locus Coeruleus)
[INFO ] Making sure inputs are available (this may take some time)
get(ok): mri/sub-01_bold.nii (file)
get(ok): mri/sub-02_bold.nii (file)
[...]
[INFO ] == Command start (output follows) =====
[INFO ] == Command exit (modification check follows) =====
add(ok): sub-01/LC_timeseries_run-*.csv (file)
add(ok): sub-02/LC_timeseries_run-*.csv (file)
[...]
action summary:
add (ok: 30)
get (ok: 30)
save (ok: 2)
unlock (ok: 30)
</code></pre>
</section>
</section>
<section>
<section>
<h2>DataLad for scientific workflows?</h2>
<dl>
<dt class="fragment fade-in-then-semi-out" data-fragment-index="1">Scientific building blocks are not static.</dt>
<dd class="fragment fade-in-then-semi-out" data-fragment-index="2">Version control beyond text</dd>
<dt class="fragment fade-in-then-semi-out" data-fragment-index="3">Science is built from modular units.</dt>
<dd class="fragment fade-in-then-semi-out" data-fragment-index="4">Nesting</dd>
<dt class="fragment fade-in-then-semi-out" data-fragment-index="5">Science is exploratory, iterative, multi-stepped, and complex.</dt>
<dd class="fragment fade-in-then-semi-out" data-fragment-index="6">Provenance</dd>
<dt class="fragment fade-in-then-semi-out" data-fragment-index="7">Science is collaborative.</dt>
<dd class="fragment fade-in-then-semi-out" data-fragment-index="8">Transport logistics</dd>
</dl>
</section>
<section data-transition="None">
<h2>Research data management is tied to reproducibility</h2>
<img src="../pics/fragile.png" height="800">
<imgcredit>Based on <a href="https://xkcd.com/2347/" target="_blank">
xkcd.com/2347/</a> (CC-BY)</imgcredit>
<small><a href="https://www.youtube.com/watch?v=nTVcMDVlyOI" target="_blank">
Reproducibility Management in Neuroscience -
Specific Issues and Solutions</a>
(<a href="https://doi.org/10.5281/zenodo.4285927" target="_blank">DOI 10.5281/zenodo.4285927</a>) </small>
</section>
</section>
<section>
<section data-markdown data-transition="None"><script type="text/template">
## FAIRly big: Scaling up
Objective: Process the UK Biobank (imaging data)
![](../pics/biobank_website.png)<!-- .element: height="400" -->
- 76 TB in 43 million files in total
- 42,715 participants contributed personal health data
- Strict DUA
- Custom binary-only downloader
- Most data records offered as (unversioned) ZIP files
</script></section>
<section data-markdown data-transition="None"><script type="text/template">
## Challenges
- Process data such that
- Results are computationally reproducible (without the original compute infrastructure)
- There is complete linkage from results to an individual data record download
- It scales with the amount of available compute resources
- Data processing pipeline
- Compiled MATLAB blob
- 1h processing time per image, with 41k images to process
- 1.2 M output files (30 output files per input file)
- 1.2 TB total size of outputs
</script></section>
<section data-transition="None">
<h2> FAIRly big setup</h2>
<img src="../pics/fairlybig_ukbsetup.png" width="1200" style="margin-top:-35px;margin-bottom:-30px">
<ul style="font-size:30px">
<strong>Exhaustive tracking</strong>
<li><a href="https://github.com/datalad/datalad-ukbiobank" target="_blank">datalad-ukbiobank</a>
extension downloads, transforms & tracks the evolution of the complete data release
in DataLad datasets
</li>
<li>Native and BIDSified data layout (at no additional disk space usage)</li>
<li>Structured in 42k individual datasets, combined to one superdataset</li>
<li>Containerized pipeline in a software container</li>
<li>Link input data & computational pipeline as dependencies</li>
</ul>
<br><br>
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">
Wagner, Waite, Wierzba et al. (2022). FAIRly big: A framework for computationally reproducible processing of large-scale data.</a>
</small>
</section>
<section data-transition="None">
<h2>FAIRly big workflow</h2>
<div class="r-stack">
<img class="fragment fade-out" src="../pics/fairlybig_workflow.png" width="1200" style="margin-top:-35px;margin-bottom:-30px">
<img src="../pics/htcondor.svg" class="fragment fade-in">
</div>
<br>
<ul style="font-size:30px">
<strong>Portability</strong>
<li>Parallel processing: 1 job = 1 subject
(number of concurrent jobs capped at the capacity of the compute cluster)
</li>
<li>Each job is computed in an ephemeral (short-lived) dataset clone, results are pushed back:
Ensure exhaustive tracking &
portability during computation</li>
<li>Content-agnostic persistent (encrypted) storage (minimizing storage and inodes)</li>
<li>Common data representation in secure environments</li>
</ul>
<br><br>
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">
Wagner, Waite, Wierzba et al. (2022). FAIRly big: A framework for computationally reproducible processing of large-scale data.</a>
</small></section>
<section data-transition="None">
<h2>FAIRly big provenance capture</h2>
<img src="../pics/fairlybig_prov.png" width="1200" style="margin-top:-35px;margin-bottom:-30px">
<br><br>
<ul style="font-size:30px">
<strong>Provenance</strong>
<li>Every single pipeline execution is tracked</li>
<li>Execution in ephemeral workspaces ensures results are
individually reproducible without HPC access</li>
</ul>
<br><br>
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">
Wagner, Waite, Wierzba et al. (2022). FAIRly big: A framework for computationally reproducible processing of large-scale data.</a>
</small></section>
<section data-markdown><script type="text/template">
## FAIRly big movie
<iframe width="1120" height="630" src="https://www.youtube-nocookie.com/embed/UsW6xN2f2jc?start=17" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
- Two computations on clusters of different scale (small cluster, supercomputer). Full video: https://youtube.com/datalad
- Two full (re-)computations, programmatically comparable, verifiable, reproducible -- on any system with data access
</script></section>
</section>
<section>
<section>
<h2>Current and future developments</h2>
</section>
<!-- I've shown you this slide already - data changes, often due to mishaps from scientists.
But sometimes, it's also political -->
<section data-transition="None">
<h3>The building blocks of a scientific result are <br>
... sometimes unreliable or threatened</h3>
<table>
<tr>
<div class="r-stack">
<p class="fragment fade-in-then-out" data-fragment-index="2">Mar 2019</p>
<!--<p class="fragment fade-in-then-out" data-fragment-index="3">Spring 2019</p>
<p class="fragment fade-in-then-out" data-fragment-index="4">July 2019</p>
<p class="fragment fade-in-then-out" data-fragment-index="5">Dec 2019</p>-->
<p class="fragment fade-in-then-out" data-fragment-index="3">Mar 2025 <br>
<small> <a href="https://www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification/" target="_blank">www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification</a> </small></p>
</div>
</tr>
<tr>
<div class="r-stack">
<img class="fragment fade-out" data-fragment-index="2" src="../pics/phd052810s.png" height="700">
<img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/abcd_data_issues1.png">
<!-- <img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/abcd_data_issues2.png">
<img class="fragment fade-in-then-out" data-fragment-index="4" src="../pics/abcd_data_issues3.png">
<img class="fragment fade-in-then-out" data-fragment-index="5" src="../pics/abcd_data_issues4.png">
<img class="fragment fade-in-then-out" data-fragment-index="6" src="../pics/abcd_data_issues4.1.png">-->
<img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/nda_review.png">
</div>
<imgcredit class="fragment fade-out" data-fragment-index="1">Piled Higher and Deeper
<a href="https://phdcomics.com/comics/archive_print.php?comicid=1323" target="_blank">
1323
</a> </imgcredit>
</tr>
</table>
<div class="r-stack">
<p style="vertical-align:middle" class="fragment fade-in" data-fragment-index="3"><u>Data</u> changes <br>
<small>Due to presidential executive orders<br>
to remove files mentioning "gender"<br>
</small></p>
</div>
</section>
<!-- Previously, we told PhD students that decentralization saves them time
when -->
<section>
<h2>Freedom? Choose Decentralization</h2>
<ul>
<li>Infrastructure is ephemeral:
<ul>
<li>Change of institutional contracts</li>
<li>Change of affiliations</li>
<li>Geopolitical developments?</li>
</ul>
</li>
<li>DataLad datasets are portable
<ul>
<li>Effortless migrations to different Git or data hosting</li>
<li>Versioning allows for integrity checks</li>
</ul>
</li>
</ul>
<br><br><br>
<p>Delineation and advantages of decentral versus central RDM:<br><a href="https://doi.org/10.1515/nf-2020-0037" target="_blank">
Hanke et al., (2021). In defense of decentralized research data management</a></p>
</section>
<section>
<h2>Going self-hosted with forgejo-aneksajo</h2>
<ul>
<li>Forgejo (<a href="https://forgejo.org" target="_blank">forgejo.org</a>): Fork of Gitea</li>
<li class="fragment fade-in"><a href="https://codeberg.org/forgejo-aneksajo/forgejo-aneksajo" target="_blank">
Forgejo-aneksajo</a>: Forgejo with git-annex support</li>
</ul>
<div class="r-stack">
<img src="../pics/datalad-hub-frontpage.png">
<img class="fragment fade-in" src="../pics/naturalistic-imaging-hub.png">
</div>
</section>
<section data-markdown data-transition="none"><script type="text/template">
### Full-stack RDM for independent, interoperable collaborators
![](../pics/forgejo.webp)
![Consortium RDM setup](../pics/consortium_rdm_setup.svg)<!-- .element: width="400" style="margin-top:-20px;margin-bottom:-10px" -->
scale-free organization: consortium, institution, lab, researcher
<div style="float:left;max-width:50%">
<ul>
<li>maximum contributor benefit</li>
<li>self-hostable, independently governed solutions, e.g.,
<a href="https://atris.fz-juelich.de" target="_blank">atris.fz-juelich.de</a>, <a href="https://hub.trr379.de" target="_blank">hub.trr379.de</a> </li>
</ul>
</div>
<div style="float:left;max-width:50%">
<ul>
<li>minimum contributor cost</li>
<li>self-contained contributor scopes, not inheriting complexity of others</li>
</ul>
</div>
</script></section>
<section>
<h2>Development Roadmap</h2>
<img src="../pics/roadmap_2025.png">
</section>
<section style="font-size:45px" data-transition="None" data-background-image="../pics/distribits-teaser-2025.svg"
data-background-size="1800px" data-background-opacity="0.2">
<h1>Join us!</h1>
<ul>
<strong>Distribits 2025</strong>
<li>International conference on technologies for distributed data management</li>
<li>2-day conference plus single-day Hackathon </li>
<li>@ Haus der Universität Düsseldorf</li>
<li>Registration open until May 1st</li>
</ul>
<br><br><br><br>
<h2><a href="https://distribits.live" target="_blank">distribits.live</a> </h2>
</section>
</section>
<section>
<section>
<h2>DataLad contact and more information</h2>
<table>
<tr><td>Website + Demos</td>
<td><a href="https://datalad.org">https://datalad.org</a></td>
</tr><tr><td>Documentation</td>
<td><a href="https://handbook.datalad.org">https://handbook.datalad.org</a></td>
</tr><tr><td>Talks and tutorials</td>
<td><a href="https://youtube.com/datalad">https://youtube.com/datalad</a></td>
</tr><tr><td>Development</td>
<td><a href="https://github.com/datalad">https://github.com/datalad</a></td>
</tr><tr><td>Support</td>
<td><a href="https://matrix.to/#/#datalad:matrix.org">https://matrix.to/#/#datalad:matrix.org</a></td>
</tr><tr><td>Open data</td>
<td><a href="https://datasets.datalad.org">https://datasets.datalad.org</a></td>
</tr>
<tr><td>Mastodon</td>
<td>@datalad@fosstodon.org</td>
</tr>
</table>
</section>
<section data-markdown><script type="text/template">
## Extensive documentation and training materials
![](../pics/cover.svg)<!-- .element: width="700" style="margin-top:-20px;margin-bottom:-10px" -->
https://handbook.datalad.org (or ISBN 979-8857037973)
- **educational materials** on technologies &mdash; **targeting researchers**, not developers (executable paper, student supervisor workflow,
...)
- handbook on concepts, workflows, and use cases
- **weekly public (virtual) office hour**
Note:
RDM Education is key. Handbook helps people be more productive, yielding more FAIR resources as an outcome, but not as the main goal.
</script></section>
</section>
<section>
<section>
<h1>Thanks!</h1>
<img src="../pics/qr_nhr.png" height="400">
</section>
</section>
<section>
<!-- BACKUP -->
<section data-markdown><script type="text/template">
## Talk is cheap, show me the code: Git vs. DataLad
<iframe width="1120" height="630" src="https://www.youtube-nocookie.com/embed/Yrg6DgOcbPE" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
https://www.youtube.com/watch?v=Yrg6DgOcbPE
<aside class="notes">
- show git limits: commit a change in a 3rd-level submodule
- show annex limits: get file in a subdataset
- reveal: datalad makes repo-boundaries vanish -- show save -r
</aside>
</script></section>
</section>
</div>
</div>
<script src="../reveal.js/dist/reveal.js"></script>
<script src="../reveal.js/plugin/notes/notes.js"></script>
<script src="../reveal.js/plugin/markdown/markdown.js"></script>
<script src="../reveal.js/plugin/highlight/highlight.js"></script>
<script>
// Boot the reveal.js deck. Option reference:
//   https://revealjs.com/initialization/
//   https://revealjs.com/config/
Reveal.initialize({
  // Plugins to load (overview: https://revealjs.com/plugins/)
  plugins: [ RevealMarkdown, RevealHighlight, RevealNotes ],

  // Slide canvas: the "normal" authoring size. The aspect ratio is kept
  // when the deck is scaled to other resolutions; percentage units are
  // also accepted.
  width: 1280,
  height: 960,
  // Fraction of the display left empty around the content
  margin: 0.3,
  // Lower/upper bound of the scale factor applied to the content
  minScale: 0.2,
  maxScale: 1.0,
  center: true,

  // Navigation, URL handling and on-screen chrome
  hash: true,
  history: true,
  controls: true,
  progress: true,
  slideNumber: 'c',
  transition: 'slide', // none/fade/slide/convex/concave/zoom

  // PDF export
  pdfSeparateFragments: true,
  pdfMaxPagesPerSlide: 1,
  pdfPageHeightOffset: -1
});
</script>
</body>
</html>