datalad-course/html/nhr_2025_datalad.html

<!doctype html>
<html lang="en">
	<head>
		<meta charset="utf-8">
		<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">

		<!-- Edit me start! -->
		<title>DataLad @ NHR </title>
		<meta name="description" content="Decentralized Management of Digital Objects for Open Science">
		<meta name="author" content="Adina Wagner">
		<!-- Edit me end! -->

		<link rel="stylesheet" href="../reveal.js/dist/reset.css">
		<link rel="stylesheet" href="../reveal.js/dist/reveal.css">
		<link rel="stylesheet" href="../reveal.js/dist/theme/beige.css">
        <link rel="stylesheet" href="../css/main.css">

		<!-- Theme used for syntax highlighted code -->
		<link rel="stylesheet" href="../reveal.js/plugin/highlight/monokai.css">
	</head>
	<body>
		<div class="reveal">
			<div class="slides">

<section>
      <section>
<h2>DataLad</h2>
<h3>Decentralized Management of Digital Objects for Open Science</h3>

  <div style="margin-top:1em;text-align:center">
  <table style="border: none;">
  <tr>
	<td style="border: none;">Dr. Adina Wagner
	  <br><small>
		<a href="https://mas.to/@adswa" target="_blank">
		  <img data-src="../pics/mastodon.svg" style="height:30px;margin:0px" />
		  mas.to/@adswa</a></small></td>
    <td style="border: none;">
	  <br></td>
  </tr>
  <tr>
    <td style="border: none; vertical-align:top">
        <small>
          <br> Institute of Neuroscience and
          Medicine, Brain &amp; Behavior (INM-7)<br>
       Research Center Jülich</small><br>
    </td>
      <td><img style="height:100px;margin-right:10px" data-src="../pics/fzj_logo.png" /></td>
  </tr>
  </table>
  </div>
<!--        <p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:0px;margin-bottom:100px;margin-left:1000px">
        <img src="../pics/qr_nhr.png" height="200">
    </p>-->
<br><br><small>

    Slides: <a href="https://doi.org/10.5281/zenodo.15193934" target="_blank">
    DOI 10.5281/zenodo.15193934</a> (Scan the QR code) <br>
          <a href="https://files.inm7.de/adina/talks/html/nhr_2025_datalad.html" target="_blank">
    files.inm7.de/adina/talks/html/nhr_2025_datalad.html</a></small>
</section>


<section>
  <h2>Acknowledgements</h2>
  <table>
  <tr style="vertical-align:middle">
    <td style="vertical-align:middle">
      <dl>
        <dt style="margin-top:20px">DataLad software <br>
            & ecosystem</dt>
        <dd style="margin-left:5px!important">
          <ul style="margin-left:5px!important">
              <li>Psychoinformatics Lab, <br>
              Research Centre Jülich</li>
              <li>Center for Open <br>
              Neuroscience, <br>
              Dartmouth College</li>
              <li>Joey Hess (git-annex)</li>
              <li><em>>100 additional contributors</em></li>
          </ul>
        </dd>
      </dl>
    </td>
    <td style="vertical-align:middle">
  <div style="margin-bottom:-20px;text-align:center"><strong>Funders</strong></div>
  <img style="height:150px;margin-right:50px" data-src="../pics/nsf.png" />
  <img style="height:150px;margin-right:50px;margin-left:50px" data-src="../pics/binc.png" />
  <img style="height:150px;margin-left:50px" data-src="../pics/bmbf.png" />
  <div style="margin-top:-20px">
  <img style="height:80px;margin-top:-40px;margin-left:40px" data-src="../pics/fzj_logo.svg" />
  <img style="height:60px;margin-left:50px;margin-bottom:25px" data-src="../pics/dfg_logo.png" />
  </div>
  <div style="margin-top:-20px">
  <img style="height:60px;margin-right:20px" data-src="../pics/erdf.png" />
  <img style="height:60px;margin-right:20px" data-src="../pics/cbbs_logo.png" />
  <img style="height:60px" data-src="../pics/LSA-Logo.png" />
  </div>
  <div style="margin-top:40px;margin-bottom:20px;text-align:center"><strong>Collaborators</strong></div>
  <div style="margin-top:-20px">
  <img style="height:100px;margin:20px" data-src="../pics/hbp_logo.png" />
  <img style="height:100px;margin:20px" data-src="../pics/conp_logo.png" />
  <img style="height:120px;margin:10px" data-src="../pics/openneuro_logo.png" />
  </div>
  <div style="margin-top:-40px">
  <img style="height:100px;margin:20px" data-src="../pics/ebrains-logo.png"/>
  <img style="height:100px;margin:0px" data-src="../pics/gin-logo.png" />
  <img style="height:120px;margin:10px" data-src="../pics/sfb1451_logo.png" />
</div>
  <div style="margin-top:-40px;align:middle">
  <img style="height:140px;margin:10px" data-src="../pics/brainlife_logo.png" />
  <img style="height:100px;margin:0px" data-src="../pics/cbrain_logo.png" />
  <img style="height:100px;margin:20px" data-src="../pics/vbc_logo.png" />
  </div>
  </td>
  </tr>
  </table>
</section>
</section>

    <!-- Data Management and DataLad -->


<section>

    <!-- Show of hands who has seen this image.
    What is it that people hint at when they show this image? (Git)
    -->
<section data-transition="None">

     <h3 class="fragment fade-in" data-fragment-index="1">The building blocks of a scientific result are rarely static</h3>
            <table>
                <tr>
                    <div class="r-stack">
                        <p class="fragment fade-in-then-out" data-fragment-index="2">Mar 2019</p>
                        <p class="fragment fade-in-then-out" data-fragment-index="3">Spring 2019</p>
                        <p class="fragment fade-in-then-out" data-fragment-index="4">July 2019</p>
                        <p class="fragment fade-in-then-out" data-fragment-index="5">Dec 2019</p>
                       <!-- <p class="fragment fade-in-then-out" data-fragment-index="6">Mar 2025 <br>
                       <small> <a href="https://www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification/" target="_blank">
                       www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification</a> </small></p>-->
                    </div>
                </tr>
                <tr>
                        <div class="r-stack">
                            <img class="fragment fade-out" data-fragment-index="2" src="../pics/phd052810s.png" height="700">
                            <img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/abcd_data_issues1.png">
                            <img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/abcd_data_issues2.png">
                            <img class="fragment fade-in-then-out" data-fragment-index="4" src="../pics/abcd_data_issues3.png">
                            <img class="fragment fade-in-then-out" data-fragment-index="5" src="../pics/abcd_data_issues4.png">
                            <img class="fragment fade-in-then-out" data-fragment-index="6" src="../pics/frontend_vs_backend_paper.png">
                         <!--   <img class="fragment fade-in-then-out" data-fragment-index="6" src="../pics/nda_review.png"> -->
                        </div>
                    <imgcredit class="fragment fade-out" data-fragment-index="1">Piled Higher and Deeper
                        <a href="https://phdcomics.com/comics/archive_print.php?comicid=1323" target="_blank">
                            1323
                        </a> </imgcredit>

                </tr>
            </table>

    <div class="r-stack">
                                <p style="vertical-align:middle" class="fragment fade-in-then-out" data-fragment-index="1"><u>Data</u> changes <br>
                        <small>(errors are fixed, data is extended,<br>
                            naming standards change, an analysis <br>
                                requires only a subset of your data...)</small></p>
                        <p style="vertical-align:middle" class="fragment fade-in" data-fragment-index="2">
                        <small>source: <a href="https://abcdstudy.org/scientists/data-sharing-archive" target="_blank">abcdstudy.org/scientists/data-sharing-archive</a> </small></p>
    </div>
</section>
</section>

<section>
      <section>
              <img style="height:300px; margin-top: 0; margin-right:1px;vertical-align:middle;" src="../pics/datalad_logo_wide.svg" alt="">
    <br>
                    <ul style="font-size:37px">
                        <li>Domain-agnostic <strong>command-line tool</strong>
                            (+ <strong>graphical user interface</strong>),
            built on top of <a href="https://git-scm.com/" target="_blank">Git</a>
            & <a href="https://git-annex.branchable.com/" target="_blank">Git-annex</a></li>
        <li>Open source (MIT) research software developed since 2013</li>
        <li>Available for all major operating systems</li>
        <li>Major features:</li>
        <dt>Version-controlling arbitrarily large content </dt>
        <dd>Version control data & software alongside code!</dd>
        <dt>Transport mechanisms for sharing & obtaining data </dt>
        <dd>Consume & collaborate on data (analyses) like software</dd>
        <dt>(Computationally) reproducible data analysis</dt>
        <dd>Track and share provenance of all digital objects</dd>
        <dt>(... and <i>much</i> more) </dt>
        <br>
    </ul>
</section>

    <section data-transition="None">
        <img src="../pics/vamp_0_start.png"><br><br>
    A DataLad dataset is a joint Git/git-annex repository that can version control any file
        <br><br>
<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# turn any directory into a dataset
# with version control

% datalad create &lt;directory&gt;
</pre></code>
</td><td style="padding:0px">
<code><pre>
# save a new state of a dataset with
# file content of any size

% datalad save
</pre></code>
</td></tr></table>
    </section>
    <section data-transition="None">
        <img src="../pics/vamp_1_provcapture.png">
        <br><br>
        Which data (at which version), with which code, running with what parameterization in which
computational environment, to generate what?<br><br>

<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# execute any command and capture its output
# while recording all input versions too

% datalad run --input ... --output ... &lt;command&gt;
</pre></code>
</td></tr></table>
    </section>
    <section data-transition="None">
        <img src="../pics/vamp_2_pushtocloud.png">
        <br><br>
        Decentralized data transport to Git hosting, local or remote infrastructure, or external hosting services
        <br><br>

<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# transfer data and metadata to other sites and services
# with fine-grained access control for dataset components

% datalad push --to &lt;site-or-service&gt;
</pre></code>
</td></tr></table>
    </section>

    <section data-transition="None">
        <img src="../pics/vamp_3_reproduce.png">
        <br><br>
Outcomes can be validated. This enables audits, promotes accountability, and streamlines automated "upgrades" of outputs
        <br><br>
<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# obtain dataset (initially only identity,
# availability, and provenance metadata)

% datalad clone &lt;url&gt;
</pre></code>
</td><td style="padding:0px">
<code><pre>
# immediately actionable provenance records
# full abstraction of input data retrieval

% datalad rerun &lt;commit|tag|range&gt;
</pre></code>
</td></tr></table>
    </section>

    <section data-transition="None">
        <img src="../pics/vamp_4_reuse.png">
        <br>Datasets can be (re-)used as modular components in larger contexts — propagating
their traits. They are verifiable, portable, self-contained data structures
        <br><br>
<table width=100% style="padding:0px">
<tr><td style="padding:0px">
<code><pre>
# declare a dependency on another dataset and
# re-use it at a particular state in a new context

% datalad clone -d &lt;superdataset&gt; &lt;url&gt; &lt;path-in-dataset&gt;
</pre></code>
</td></tr></table>
    </section>


<section>
        <h2>Version control beyond text files</h2>
    <p class="fragment fade-in" data-fragment-index="2">
    <img class="fragment fade-in" data-fragment-index="2" src="../pics/gitannex.png" height="100px">
    Using <a href="https://git-annex.branchable.com" target="_blank">git-annex</a>,
    <a href="https://datalad.org" target="_blank">DataLad</a> version controls large data
    <img class="fragment fade-in" data-fragment-index="2" src="../pics/datalad_logo_wide.svg" height="100px"></p>
    <div class="r-stack">
        <img class="fragment fade-in" height="500" data-fragment-index="3" src="../pics/tigdata.png">
        <img class="fragment fade-in" height="500" data-fragment-index="4" src="../pics/tigdata3.png">
        <img class="fragment fade-in" height="500" data-fragment-index="5" src="../pics/tigdata2.png">
    </div>
</section>

<section data-transition="None" style="font-size:35px">
<h2>Version control beyond text files</h2>
<ul>
    <li>Datasets have an <b>annex</b> to track files without
        placing their content into Git</li>
    <li>Rather than content, <strong>identity</strong> (hash) and <strong>location</strong> information is put into Git:</li>
    <ul>
        <li class="fragment fade-in" data-fragment-index="0">Where the filesystem allows it, annexed files are symlinks:</li>
            </ul>
    </ul>
        <pre class="fragment fade-in" data-fragment-index="0"><code class="fragment fade-in language-bash" style="max-width:none" data-fragment-index="0">$ ls -l sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
lrwxrwxrwx 1 adina adina 142 Jul 22 19:45 sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz ->
../../.git/annex/objects/kZ/K5/MD5E-s24180157--aeb0e5f2e2d5fe4ade97117a8cc5232f.nii.gz/MD5E-s24180157
--aeb0e5f2e2d5fe4ade97117a8cc5232f.nii.gz
</code></pre><small class="fragment fade-in" data-fragment-index="0">(PS: especially useful in datasets with many identical files) </small>
    <ul><ul>
    <li class="fragment fade-in" data-fragment-index="1">The symlink reveals this internal data organization based on the identity hash</li>
            </ul>
</ul>
        <pre class="fragment fade-in" data-fragment-index="1"><code class="fragment fade-in language-bash" data-fragment-index="1">$ md5sum sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
aeb0e5f2e2d5fe4ade97117a8cc5232f  sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
</code></pre>
    <ul><ul>
    <li class="fragment fade-in" data-fragment-index="2">The (tiny) symlink instead of the (potentially large) file content is
        committed - version controlling precise file identity without checking contents into Git
        <img src="../pics/annex-commit.png"></li>
        <li class="fragment fade-in" data-fragment-index="3">File availability information is stored to
        record a decentralized network of file content.
        A file can exist in multiple different locations.</li>
         </ul></ul>
        <pre class="fragment fade-in" data-fragment-index="3"><code class="fragment fade-in language-bash" data-fragment-index="3">$ git annex whereis sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
whereis sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz (2 copies)
  	8c3680dd-6165-4749-adaa-c742232bc317 -- git@8242caf9acd8:/data/repos/adswa/bidsdata.git [gin]
   	fff8fdbc-3185-4b78-bd12-718717588442 -- adina@muninn:~/bids-data [here]
ok
</code></pre>
</section>


  <section>
      <h2>Git versus Git-annex</h2>
      <dl>
          <dt>Data in datasets is either stored in Git or git-annex</dt>
          <dd>By default, everything is <i>annexed</i>, i.e., stored in a dataset annex</dd>
          </dl>
        <img height="400" src="../pics/artwork/src/publishing/publishing_gitvsannex.svg">
                    <small>
          <table>
              <tr>
                  <td><b>Git</b></td>
                  <td><b>git-annex</b></td>
              </tr>
              <tr>
                  <td>handles <b>small</b> files well (text, code)</td>
                  <td>handles <b>all</b> types and sizes of files well</td>
              </tr>
              <tr>
                  <td>file contents are in the Git history
                      and will be <b>shared</b> upon git/datalad push</td>
                  <td>file contents are in the annex. Not necessarily shared</td>
              </tr>
              <tr>
                  <td>Shared with every dataset clone</td>
                  <td><b>Can be kept private</b> on a per-file level when sharing the dataset</td>
              </tr>
              <tr>
                  <td>Useful: Small, non-binary, frequently modified, need-to-be-accessible (DUA, README) files </td>
                  <td>Useful: Large files, private files</td>
              </tr>
          </table>
              </small>
  </section>
</section>

<section>

    <section>
    <h2>(Raw) data mismanagement</h2>
    <ul>
        <li>Multiple large datasets are available on a compute cluster 🏞 </li>
        <li>Each researcher creates their own copies of data ⛰ </li>
        <li>Multiple different derivatives and results are computed from it 🏔</li>
        <li>Data, copies of data, half-baked data transformations, results, and
            old versions of results are kept - undocumented 🌋 </li>
    </ul>
</section>

<section data-transition="None">
    <h2>Share data like source code</h2>
    <div class="r-stack">
    <img class="fragment fade-in-then-out" data-fragment-index="0" src="../pics/centralmanagement2.gif" alt="a screenrecording of cloning an institutional superdataset from GitLab">
    <img class="fragment fade-in" data-fragment-index="1" style="box-shadow: 5px 5px 3px #888888"  height="330" src="../pics/artwork/src/collaboration.svg">
    </div>
        <aside class="notes">
Idea behind datalad: Enable a similar level of tooling and culture for the distribution and version control of data as it is present for open source software development
</aside>
</section>


  <section>
    <h3>Transport logistics: Lots of data, little disk-usage</h3>
    <ul>
      <li class="fragment fade-in">
          Cloned datasets are lean.
          "Meta data" (file names, availability) are present, but <b>no file content</b>:</li>
      <pre class="fragment fade-in"><code data-trim class="language-bash" onmousemove="showHover(event)" onmousedown="clickCopy(event)" onmouseleave="leaveElement(event)">$ datalad clone git@github.com:psychoinformatics-de/studyforrest-data-phase2.git
  install(ok): /tmp/studyforrest-data-phase2 (dataset)
$ cd studyforrest-data-phase2 && du -sh
  18M	.</code></pre>

      <li class="fragment fade-in">
          files' contents can be retrieved on demand:
      </li>
    </ul>
      <pre class="fragment fade-in"><code data-trim class="language-bash" onmousemove="showHover(event)" onmousedown="clickCopy(event)" onmouseleave="leaveElement(event)">$ datalad get sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
  get(ok): /tmp/studyforrest-data-phase2/sub-01/ses-movie/func/
           sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file) [from mddatasrc...]</code></pre>

      <ul>
      <li class="fragment fade-in">Have access to more data on your computer than you have disk-space:</li>
      <pre class="fragment fade-in"><code># eNKI dataset (1.5TB, 34k files):
$ du -sh
1.5G	.
# HCP dataset (~200TB, >15 million files)
$ du -sh
48G	. </code></pre>
    </ul>
  </section>

  <section data-markdown data-transition="None"> <script type="text/template">
  ## Plenty of data, but little disk-usage

  Drop file content that is not needed:<!-- .element: class="fragment fade-in" -->
  <pre class="fragment fade-in"><code data-trim class="language-bash">$ datalad drop sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
  drop(ok): /[...]/sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file)
  </code></pre>
  Only "meta data" stays behind, and files can be re-obtained on demand. This allows for disk-space-aware computing workflows:<!-- .element: class="fragment fade-in" -->
<pre><code class="python">dl.get('input/sub-01')
[really complex analysis]
dl.drop('input/sub-01')
</code></pre><!-- .element: class="fragment fade-in" -->
  </script></section>


  <section data-transition="None">
      <h2>Publishing datasets</h2>
      <ul>
          Publish datasets, their annexed contents, or both to infrastructure of your choice
      </ul>
      <div class="r-stack">
          <img data-fragment-index="0" height="600" src="../pics/artwork/src/publishing/publishing_network_gitvsannex.svg">
      </div>
  </section>

<section data-transition="None">
    <h2>Interoperability</h2>
    <ul>
        <li>DataLad is built to maximize interoperability and streamline routines across hosting and
            storage technology</li>
    </ul>
    <img src="../pics/services_connected.png" height="650">
</section>

</section>

<section>
        <!-- on modularity -->
<section data-markdown><script type="text/template">
## Modularity

![](../pics/submodule_setup.svg)<!-- .element: height="500" -->

- Typical workflow in science
  - Prior works (algorithm development, empirical data, etc.) are combined
    to produce novel results with the goal of a publication
  - **Aggregation across time and contributors**
  - Aiming (but often failing) to be reproducible
</script>
</section>

<section data-markdown><script type="text/template">
## Version control beyond single repositories

- **Why** are multiple repositories needed (in science)?

  - Size impacts I/O and logistics
    - Git can struggle with 1M+ files or 100k+ commits
    - Filesystems (licensing) can struggle with large numbers of inodes

  - Target audience is different
    - Public vs. private or personal vs. anonymized data

  - Pace of evolution or access patterns are different
    - "Factual" raw data vs. choices of (pre-)processing
    - Completed acquisition vs. ongoing study
![](../pics/dataflow.jpg)<!-- .element: height="200" -->
- A **single repository is not enough**, but Git/Git-annex are not optimized
  for such use cases

</script>
</section>

<section data-transition="None">
    <h2>Dataset Nesting</h2>

    <ul>
        <li>Seamless nesting mechanisms:
                <img height="330"  src="../pics/artwork/src/linkage_subds.svg">
            <ul>
                <li>hierarchies of datasets in super-/sub-dataset relationships</li>
                <li>based on Git submodules, but more seamless: Mono-repo feel thanks to recursive operations</li>
                </ul>
        <li class="fragment fade-in" data-fragment-index="2">Overcomes scaling issues with large amounts of files</li>
        <pre  class="fragment fade-in" data-fragment-index="2"><code>adina@bulk1 in /ds/hcp/super on git:master❱ datalad status --annex -r
15530572 annex'd files (77.9 TB recorded total size)
nothing to save, working tree clean</code></pre>
        <small><a class="fragment fade-in" data-fragment-index="2" href="https://github.com/datalad-datasets/human-connectome-project-openaccess" target="_blank">(github.com/datalad-datasets/human-connectome-project-openaccess)</a></small>
        <li class="fragment fade-in">Modularizes research components for transparency, reuse, and access management</li>
    </ul>
</section>


<section data-transition="None">
    <h2>Intuitive data analysis structure</h2>

    <ul><li>You can link datasets together in superdataset-subdataset hierarchies:</li></ul>
    <img src="../pics/artwork/src/linkage_subds.svg" width="900"> <br>
            <pre><code style="max-width:none" class="bash" data-line-numbers="1,3, 6">$ cd myanalysis
# we can install analysis input data as a subdataset to the dataset
$ datalad clone -d . https://github.com/datalad-handbook/iris_data.git input/
[INFO   ] Scanning for unlocked files (this may take some time)
[INFO   ] Remote origin not usable by git-annex; setting annex-ignore
install(ok): input (dataset)
add(ok): input (file)
add(ok): .gitmodules (file)
save(ok): . (dataset)
action summary:
  add (ok: 2)
  install (ok: 1)
  save (ok: 1)
</code></pre>
</section>
</section>


<section>

<section data-transition="None">
    <h2>Leaving a trace </h2>
        <p>"Shit, which version of which script produced these outputs from which version
    of what data?"</p>
        <p>
            "Shit, which buttons did I click and in which order did I use all those tools?"</p>
    <br>
        <p>
    <img src="../pics/manuallabor.png">
    <img src="../pics/findfiles.png" height="400">
    <img src="../pics/projectstack.png" height="350">
    <imgcredit>CC-BY Scriberia and <a href="https://the-turing-way.netlify.app/reproducible-research/rdm.html" target="_blank">
        The Turing Way</a>
    </imgcredit>
        </p>
</section>


  <section data-transition="None">
  <h2>Leaving a trace</h2>
  <p class="fragment" data-fragment-index="1"> <strong>datalad run</strong> wraps around anything expressed in a command
  line call and saves the dataset modifications resulting from the execution.</p>
  <p class="fragment" data-fragment-index="2"> <strong>datalad rerun</strong> repeats captured executions.
  If the outcomes
  differ, it saves a new state of them.</p>
  <p class="fragment" data-fragment-index="3"> <strong>datalad containers-run</strong> executes command
  line calls inside a tracked software container and saves the dataset modifications resulting from the execution.</p>

      <div class="r-stack">
      <img class="fragment fade-in-then-out" data-fragment-index="1" src="../pics/run_basic.svg" height="350">
      <img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/rerun.svg" height="350">
      <img class="fragment fade-in"  data-fragment-index="3" src="../pics/containers-run_basic.svg" height="350">
      </div>
  </section>

<section data-transition="None">
    <h2>data analysis provenance</h2>
                        <p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px">
                        Enshrine the analysis in a script
                        </p>
                        <p class="fragment fade-in" style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:180px;margin-bottom:-60px">
                         Here: extract_lc_timeseries.py
                        </p>
    <p style="z-index: -1">
<pre><code class="bash" style="max-height:none" data-line-numbers="6">$ datalad containers-run \
  --message "Time series extraction from Locus Coeruleus" \
  --container-name nilearn \
  --input 'mri/*_bold.nii' \
  --output 'sub-*/LC_timeseries_run-*.csv' \
  "python3 code/extract_lc_timeseries.py"

-- Git commit --
    commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
    Author:     Adina Wagner adina.wagner@t-online.de
    AuthorDate: Mon Nov 11 16:15:08 2019 +0100
    Commit:     Adina Wagner adina.wagner@t-online.de
    CommitDate: Mon Nov 11 16:15:08 2019 +0100

    [DATALAD RUNCMD] Time series extraction from Locus Coeruleus
    === Do not change lines below ===
    {
     "cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
     "dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
     "inputs": [
      "mri/*.bold.nii.gz",
      ".datalad/environments/nilearn.simg"
     ],
     "outputs": ["sub-*/LC_timeseries_run-*.csv"],
     ...
    }
    ^^^ Do not change lines above ^^^
---
 sub-01/LC_timeseries_run-1.csv | 1 +
...
</code></pre>
        </p>
</section>

<section data-transition="None">
    <h2>data analysis provenance</h2>
                        <p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:130px;margin-bottom:-60px;margin-left:750px">
                        Record code execution together <br> with
                        input-data, output files and software
                        environment in the
                        execution-command
                        </p>
    <p style="z-index: -1">
<pre><code class="bash" style="max-height:none" data-line-numbers="1-6">$ datalad containers-run \
  --message "Time series extraction from Locus Coeruleus" \
  --container-name nilearn \
  --input 'mri/*_bold.nii' \
  --output 'sub-*/LC_timeseries_run-*.csv' \
  "python3 code/extract_lc_timeseries.py"

-- Git commit --
    commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
    Author:     Adina Wagner adina.wagner@t-online.de
    AuthorDate: Mon Nov 11 16:15:08 2019 +0100
    Commit:     Adina Wagner adina.wagner@t-online.de
    CommitDate: Mon Nov 11 16:15:08 2019 +0100

    [DATALAD RUNCMD] Time series extraction from Locus Coeruleus
    === Do not change lines below ===
    {
     "cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
     "dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
     "inputs": [
      "mri/*.bold.nii.gz",
      ".datalad/environments/nilearn.simg"
     ],
     "outputs": ["sub-*/LC_timeseries_run-*.csv"],
     ...
    }
    ^^^ Do not change lines above ^^^
---
 sub-01/LC_timeseries_run-1.csv | 1 +
...
</code></pre>
    </p>

</section>

<section data-transition="None">
    <h2>data analysis provenance</h2>
                        <p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px;margin-left:200px">
                        Result: machine readable record about which data, code, and <br>
                        software produced a result how, when, and why.
                        </p>
    <p style="z-index: -1">
<pre><code class="bash" style="max-height:none" data-line-numbers="8-30">$ datalad containers-run \
  --message "Time series extraction from Locus Coeruleus" \
  --container-name nilearn \
  --input 'mri/*_bold.nii' \
  --output 'sub-*/LC_timeseries_run-*.csv' \
  "python3 code/extract_lc_timeseries.py"

-- Git commit --
    commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
    Author:     Adina Wagner adina.wagner@t-online.de
    AuthorDate: Mon Nov 11 16:15:08 2019 +0100
    Commit:     Adina Wagner adina.wagner@t-online.de
    CommitDate: Mon Nov 11 16:15:08 2019 +0100

    [DATALAD RUNCMD] Time series extraction from Locus Coeruleus
    === Do not change lines below ===
    {
     "cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
     "dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
     "inputs": [
      "mri/*.bold.nii.gz",
      ".datalad/environments/nilearn.simg"
     ],
     "outputs": ["sub-*/LC_timeseries_run-*.csv"],
     ...
    }
    ^^^ Do not change lines above ^^^
---
 sub-01/LC_timeseries_run-1.csv | 1 +
...
</code></pre>
    </p>
</section>

<section data-transition="None">
    <h2>data analysis provenance</h2>
                        <p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px;margin-left:350px">
                        Use the unique identifier of the execution record
                        </p>
    <p style="z-index: -1">
    <pre><code class="bash" style="max-height:none" data-line-numbers="1">$ datalad rerun 5a7565a640ff6de67
[INFO   ] run commit 5a7565a640ff6de67; (Time series extraction from Locus Coeruleus)
[INFO   ] Making sure inputs are available (this may take some time)
get(ok): mri/sub-01_bold.nii (file)
get(ok): mri/sub-02_bold.nii (file)
        [...]
[INFO   ] == Command start (output follows) =====
[INFO   ] == Command exit (modification check follows) =====
add(ok): sub-01/LC_timeseries_run-*.csv (file)
add(ok): sub-02/LC_timeseries_run-*.csv (file)
        [...]
action summary:
  add (ok: 30)
  get (ok: 30)
  save (ok: 2)
  unlock (ok: 30)
    </code></pre>
    </p>
</section>

<section data-transition="None">
    <h2>data analysis provenance</h2>
                    <p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:400px;margin-bottom:-60px;margin-left:350px">
                        ... to have a machine recompute and verify past work
                        </p>
    <p style="z-index: -1">
    <pre><code class="bash" style="max-height:none" data-line-numbers="2-16">$ datalad rerun 5a7565a640ff6de67
[INFO   ] run commit 5a7565a640ff6de67; (Time series extraction from Locus Coeruleus)
[INFO   ] Making sure inputs are available (this may take some time)
get(ok): mri/sub-01_bold.nii (file)
get(ok): mri/sub-02_bold.nii (file)
        [...]
[INFO   ] == Command start (output follows) =====
[INFO   ] == Command exit (modification check follows) =====
add(ok): sub-01/LC_timeseries_run-*.csv (file)
add(ok): sub-02/LC_timeseries_run-*.csv (file)
        [...]
action summary:
  add (ok: 30)
  get (ok: 30)
  save (ok: 2)
  unlock (ok: 30)
    </code></pre>
</section>
</section>


<section>
    <section>
    <h2>DataLad for scientific workflows?</h2>
    <dl>
        <dt class="fragment fade-in-then-semi-out" data-fragment-index="1">Scientific building blocks are not static.</dt>
        <dd class="fragment fade-in-then-semi-out" data-fragment-index="2">Version control beyond text</dd>
        <dt class="fragment fade-in-then-semi-out" data-fragment-index="3">Science is built from modular units.</dt>
        <dd class="fragment fade-in-then-semi-out" data-fragment-index="4">Nesting</dd>
        <dt class="fragment fade-in-then-semi-out" data-fragment-index="5">Science is exploratory, iterative, multi-stepped, and complex.</dt>
        <dd class="fragment fade-in-then-semi-out" data-fragment-index="6">Provenance</dd>
        <dt class="fragment fade-in-then-semi-out" data-fragment-index="7">Science is collaborative.</dt>
        <dd class="fragment fade-in-then-semi-out" data-fragment-index="8">Transport logistics</dd>
    </dl>
</section>

        <section data-transition="None">
    <h2>Research data management is tied to reproducibility</h2>
    <img src="../pics/fragile.png" height="800">
    <imgcredit>Based on <a href="https://xkcd.com/2347/" target="_blank">
        xkcd.com/2347/</a> (CC BY-NC 2.5)</imgcredit>
          <small><a href="https://www.youtube.com/watch?v=nTVcMDVlyOI" target="_blank">
              Reproducibility Management in Neuroscience -
              Specific Issues and Solutions</a>
              (<a href="https://doi.org/10.5281/zenodo.4285927" target="_blank">DOI 10.5281/zenodo.4285927</a>) </small>
</section>
</section>

<section>
<section data-markdown data-transition="None"><script type="text/template">
## FAIRly big: Scaling up

Objective: Process the UK Biobank (imaging data)
![](../pics/biobank_website.png)<!-- .element: height="400" -->

- 76 TB in 43 million files in total
- 42,715 participants contributed personal health data
- Strict DUA
- Custom binary-only downloader
- Most data records offered as (unversioned) ZIP files
</script></section>

<section data-markdown data-transition="None"><script type="text/template">
## Challenges

- Process data such that
  - Results are computationally reproducible (without the original compute infrastructure)
  - There is complete linkage from results to an individual data record download
  - It scales with the amount of available compute resources

- Data processing pipeline
  - Compiled MATLAB blob
  - 1h processing time per image, with 41k images to process
  - 1.2 M output files (30 output files per input file)
  - 1.2 TB total size of outputs
</script></section>

<section data-transition="None">
    <h2> FAIRly big setup</h2>
<img src="../pics/fairlybig_ukbsetup.png" width="1200" style="margin-top:-35px;margin-bottom:-30px">

    <ul style="font-size:30px">
        <strong>Exhaustive tracking</strong>
        <li><a href="https://github.com/datalad/datalad-ukbiobank" target="_blank">datalad-ukbiobank</a>
extension downloads, transforms &amp; tracks the evolution of the complete data release
            in DataLad datasets
</li>
        <li>Native and BIDSified data layout (at no additional disk space usage)</li>
        <li>Structured in 42k individual datasets, combined to one superdataset</li>
        <li>Containerized pipeline in a software container</li>
        <li>Link input data & computational pipeline as dependencies</li>
    </ul>
<br><br>
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">
    Wagner, Waite, Wierzba et al. (2022). FAIRly big: A framework for computationally reproducible processing of large-scale data.</a>
</small>
</section>

<section  data-transition="None">
    <h2>FAIRly big workflow</h2>
    <div class="r-stack">
<img class="fragment fade-out" src="../pics/fairlybig_workflow.png" width="1200" style="margin-top:-35px;margin-bottom:-30px">
<img src="../pics/htcondor.svg" class="fragment fade-in">
    </div>
        <br>
    <ul style="font-size:30px">
        <strong>portability</strong>
    <li>Parallel processing: 1 job = 1 subject
        (number of concurrent jobs capped at the capacity of the compute cluster)
    </li>
    <li>Each job is computed in an ephemeral (short-lived) dataset clone, results are pushed back:
        Ensure exhaustive tracking &
        portability during computation</li>
    <li>Content-agnostic persistent (encrypted) storage (minimizing storage and inodes)</li>
    <li>Common data representation in secure environments</li>
</ul>
    <br><br>
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">
    Wagner, Waite, Wierzba et al. (2022). FAIRly big: A framework for computationally reproducible processing of large-scale data.</a>
</small></section>


<section data-transition="None">
    <h2>FAIRly big provenance capture</h2>
<img src="../pics/fairlybig_prov.png" width="1200" style="margin-top:-35px;margin-bottom:-30px">
<br><br>
    <ul style="font-size:30px">
        <strong>Provenance</strong>
    <li>Every single pipeline execution is tracked</li>
    <li>Execution in ephemeral workspaces ensures results are
        individually reproducible without HPC access</li>
</ul>
<br><br>
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">
    Wagner, Waite, Wierzba et al. (2022). FAIRly big: A framework for computationally reproducible processing of large-scale data.</a>
</small></section>

<section data-markdown><script type="text/template">
## FAIRly big movie

<iframe width="1120" height="630" src="https://www.youtube-nocookie.com/embed/UsW6xN2f2jc?start=17" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>

- Two computations on clusters of different scale (small cluster, supercomputer). Full video: https://youtube.com/datalad
- Two full (re-)computations, programmatically comparable, verifiable, reproducible -- on any system with data access
</script></section>
</section>

<section>

    <section>
        <h2>Current and future developments</h2>
    </section>
    <!-- I've shown you this slide already - data changes, often due to mishaps from scientists.
    But sometimes, it's also political -->
<section data-transition="None">

     <h3>The building blocks of a scientific result are <br>
         ... sometimes unreliable or threatened</h3>
            <table>
                <tr>
                    <div class="r-stack">
                        <p class="fragment fade-in-then-out" data-fragment-index="2">Mar 2019</p>
                        <!--<p class="fragment fade-in-then-out" data-fragment-index="3">Spring 2019</p>
                        <p class="fragment fade-in-then-out" data-fragment-index="4">July 2019</p>
                        <p class="fragment fade-in-then-out" data-fragment-index="5">Dec 2019</p>-->
                        <p class="fragment fade-in-then-out" data-fragment-index="3">Mar 2025 <br>
                       <small> <a href="https://www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification/" target="_blank">www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification</a> </small></p>
                    </div>
                </tr>
                <tr>
                        <div class="r-stack">
                            <img class="fragment fade-out" data-fragment-index="2" src="../pics/phd052810s.png" height="700">
                            <img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/abcd_data_issues1.png">
                           <!-- <img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/abcd_data_issues2.png">
                            <img class="fragment fade-in-then-out" data-fragment-index="4" src="../pics/abcd_data_issues3.png">
                            <img class="fragment fade-in-then-out" data-fragment-index="5" src="../pics/abcd_data_issues4.png">
                            <img class="fragment fade-in-then-out" data-fragment-index="6" src="../pics/abcd_data_issues4.1.png">-->
                            <img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/nda_review.png">
                        </div>
                    <imgcredit class="fragment fade-out" data-fragment-index="1">Piled Higher and Deeper
                        <a href="https://phdcomics.com/comics/archive_print.php?comicid=1323" target="_blank">
                            1323
                        </a> </imgcredit>

                </tr>
            </table>

    <div class="r-stack">
                                <p style="vertical-align:middle" class="fragment fade-in" data-fragment-index="3"><u>Data</u> changes <br>
                        <small>Due to presidential executive orders<br>
                             to remove files mentioning "gender"<br>
                            </small></p>
  </div>
</section>


    <!-- Previously, we told PhD students that decentralization saves them time
    when -->
<section>
    <h2>Freedom? Choose Decentralization</h2>
    <ul>
        <li>Infrastructure is ephemeral:</li>
        <ul>
            <li>Change of institutional contracts</li>
            <li>Change of affiliations</li>
            <li>Geopolitical developments?</li>
        </ul>
        <li>DataLad datasets are portable</li>
        <ul>
            <li>Effortless migrations to different Git or data hosting</li>
            <li>Versioning allows for integrity checks</li>
        </ul>
    </ul>
    <br><br><br>
        <p>Delineation and advantages of decentral versus central RDM:<br><a href="https://doi.org/10.1515/nf-2020-0037" target="_blank">
             Hanke et al., (2021). In defense of decentralized research data management</a></p>

    </section>

<section>
    <h2>Going self-hosted with forgejo-aneksajo</h2>
    <ul>
        <li>Forgejo (<a href="https://forgejo.org" target="_blank">forgejo.org</a>): Fork of Gitea</li>
        <li class="fragment fade-in"><a href="https://codeberg.org/forgejo-aneksajo/forgejo-aneksajo" target="_blank">
            Forgejo-aneksajo</a>: Forgejo with git-annex support</li>
    </ul>
    <div class="r-stack">
        <img src="../pics/datalad-hub-frontpage.png">
        <img class="fragment fade-in" src="../pics/naturalistic-imaging-hub.png">
    </div>
</section>

<section data-markdown data-transition="none"><script type="text/template">
### Full-stack RDM for independent, interoperable collaborators
![](../pics/forgejo.webp)

![Consortium RDM setup](../pics/consortium_rdm_setup.svg)<!-- .element: width="400" style="margin-top:-20px;margin-bottom:-10px" -->


scale-free organization: consortium, institution, lab, researcher
<div style="float:left;max-width:50%">
<ul>
<li>maximum contributor benefit</li>
<li>self-hostable, independently governed solutions, e.g.,
    <a href="https://atris.fz-juelich.de" target="_blank">atris.fz-juelich.de</a>, <a href="https://hub.trr379.de" target="_blank">hub.trr379.de</a> </li>
</ul>
</div>
<div style="float:left;max-width:50%">
<ul>
<li>minimum contributor cost</li>
<li>self-contained contributor scopes, not inheriting complexity of others</li>
</ul>
</div>
</script></section>

    <section>
        <h2>Development Roadmap</h2>
        <img src="../pics/roadmap_2025.png">
    </section>


                <section style="font-size:45px" data-transition="None" data-background-image="../pics/distribits-teaser-2025.svg"
                         data-background-size="1800px" data-background-opacity="0.2">
                     <h1>Join us!</h1>
                    <ul>
                        <strong>Distribits 2025</strong>
                        <li>International conference on technologies for distributed data management</li>
                        <li>2-day conference plus single-day Hackathon</li>
                        <li>@ Haus der Universität Düsseldorf</li>
                        <li>Registration open until May 1st</li>
                    </ul>
                    <br><br><br><br>
                    <h2><a href="https://distribits.live" target="_blank">distribits.live</a> </h2>
                </section>
</section>


                <section>


<section>
  <h2>DataLad contact and more information</h2>
  <table>
  <tr><td>Website + Demos</td>
  <td><a href="https://datalad.org">https://datalad.org</a></td>
  </tr><tr><td>Documentation</td>
  <td><a href="https://handbook.datalad.org">https://handbook.datalad.org</a></td>
  </tr><tr><td>Talks and tutorials</td>
  <td><a href="https://youtube.com/datalad">https://youtube.com/datalad</a></td>
  </tr><tr><td>Development</td>
  <td><a href="https://github.com/datalad">https://github.com/datalad</a></td>
  </tr><tr><td>Support</td>
  <td><a href="https://matrix.to/#/#datalad:matrix.org">https://matrix.to/#/#datalad:matrix.org</a></td>
  </tr><tr><td>Open data</td>
  <td><a href="https://datasets.datalad.org">https://datasets.datalad.org</a></td>
  </tr><tr><td>Mastodon</td>
  <td>@datalad@fosstodon.org</td>
  </tr>
  </table>
</section>
                    <section data-markdown><script type="text/template">
## Extensive documentation and training materials
![](../pics/cover.svg)<!-- .element: width="700" style="margin-top:-20px;margin-bottom:-10px" -->

https://handbook.datalad.org (or ISBN 979-8857037973)

- **educational materials** on technologies &mdash; **targeting researchers**, not developers (executable paper, student supervisor workflow,
  ...)
- handbook on concepts, workflows, and use cases
- **weekly public (virtual) office hour**

Note:
RDM Education is key. Handbook helps people be more productive, yielding more FAIR resources as an outcome, but not as the main goal.
</script></section>

                </section>

<section>
    <section>
        <h1>Thanks!</h1>
        <img src="../pics/qr_nhr.png" height="400">
    </section>
</section>
<section>
    <!-- BACKUP -->
    <section data-markdown><script type="text/template">
## Talk is cheap, show me the code: Git vs. DataLad

<iframe width="1120" height="630" src="https://www.youtube-nocookie.com/embed/Yrg6DgOcbPE" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>

https://www.youtube.com/watch?v=Yrg6DgOcbPE

<aside class="notes">
- show git limits: commit a change in a 3rd-level submodule
- show annex limits: get file in a subdataset
- reveal: datalad makes repo-boundaries vanish -- show save -r
</aside>
</script></section>
</section>


			</div>
		</div>

		<script src="../reveal.js/dist/reveal.js"></script>
		<script src="../reveal.js/plugin/notes/notes.js"></script>
		<script src="../reveal.js/plugin/markdown/markdown.js"></script>
		<script src="../reveal.js/plugin/highlight/highlight.js"></script>
		<script>
			// Boots the slide deck. Option reference:
			// - https://revealjs.com/initialization/
			// - https://revealjs.com/config/
			{
				// Settings are collected in one object first so the
				// initialize() call below stays a single short statement.
				const deckConfig = {
					hash: true,

					// Authoring ("normal") size of a slide. The aspect ratio is
					// kept when the deck is scaled to another resolution;
					// percentage units are also accepted.
					width: 1280,
					height: 960,

					// Share of the display left empty around the slide content
					margin: 0.3,

					// Lower/upper limit for the scale factor applied to content
					minScale: 0.2,
					maxScale: 1.0,

					// Navigation and display behaviour
					controls: true,
					progress: true,
					history: true,
					center: true,
					slideNumber: 'c',

					// PDF export behaviour
					pdfSeparateFragments: true,
					pdfMaxPagesPerSlide: 1,
					pdfPageHeightOffset: -1,

					// Default slide transition: none/fade/slide/convex/concave/zoom
					transition: 'slide',

					// Plugins loaded by the <script> tags above;
					// see https://revealjs.com/plugins/
					plugins: [ RevealMarkdown, RevealHighlight, RevealNotes ]
				};

				Reveal.initialize(deckConfig);
			}
		</script>
	</body>
</html>