Merge pull request #30 from UCL-ARC/paul/all-subjects
Make `subjects.txt` optional
p-j-smith authored Jan 27, 2025
2 parents dabeef4 + ce01987 commit 23e4ea4
Showing 2 changed files with 58 additions and 53 deletions.
71 changes: 26 additions & 45 deletions README.md
@@ -30,11 +30,9 @@ Setting up and running the pipeline requires the following steps, which are expl
flowchart TD
installation("`Install prerequisites`")
convert("`Ensure data is in BIDS format`")
subjects("`Create a subjects.txt input file`")
run("`Run the container with Docker / Apptainer`")
installation --> convert
convert --> subjects
subjects --> run
convert --> run
```

## 1. Install prerequisites
@@ -76,32 +74,7 @@ data
│ └───sub-1_ses-1_FLAIR.nii.gz
```

## 3. Create `subjects.txt` file

Inside your top-level BIDS directory (e.g. `data` in the above example structure), create a `subjects.txt` file that
contains subject identifiers (one per line).

The subject identifiers **must** match the names of the corresponding subject folders, e.g. `sub-1`, `sub-2`.

Your final file structure should look like the following (for two example subject IDs):

```bash
data
├── sub-1
│   └── ses-1
│       └── anat
│           ├── sub-1_ses-1_T1w.nii.gz
│           └── sub-1_ses-1_FLAIR.nii.gz
├── sub-2
│   └── ses-1
│       └── anat
│           ├── sub-2_ses-1_T1w.nii.gz
│           └── sub-2_ses-1_FLAIR.nii.gz
└── subjects.txt
```

## 4. Run the container
## 3. Run the container

>[!IMPORTANT]
> When running the container, make sure you run the command from the top-level directory
@@ -155,25 +128,33 @@ image.

### Options

- `-n` : the number of jobs to run in parallel.
- `-n` : the number of jobs to run in parallel. Defaults to 1. See also potential issues
of [increased memory usage](#tensorflow-memory-usage) when running in parallel.

By default (without `-n`), the pipeline will process your subjects sequentially on 1 core. With `-n` they will be
processed in parallel with `n` jobs. For example:
- `-o` : overwrite existing intermediate files

```bash
# Run with 5 jobs
-n 5
```
When this flag is set, the pipeline will run all steps, overwriting any previous output for a given
session.

A good default value is the number of cores on your system, but be
[wary of increased memory usage](#tensorflow-memory-usage).
When this flag is not set (the default), the pipeline will re-use any existing output files, skipping steps that have
previously been completed. This is useful if, for example, the pipeline fails at a late stage and you want to run it
again without having to re-run time-consuming earlier steps.

- `-o` : whether to overwrite existing output files
- `-f` : path to a file containing a list of subjects to target.

By default (without `-o`), the pipeline will try to re-use any existing output files, skipping steps that are already
complete. This is useful if, for example, the pipeline fails at a late stage and you want to run it again, without
having to re-run time-consuming earlier steps. With `-o` the pipeline will run all steps again and ensure any previous
output is overwritten.
The path must be relative to your data directory, and the file must be within the `data/` directory or one of its
sub-directories. The file must contain one subject per line, e.g.

```bash filename="subjects.txt"
sub-1
sub-2
sub-3
```

- `-s` : comma-separated list of subjects to include in the analysis, e.g. `-s sub-1,sub-2,sub-3`

> [!NOTE]
> If both `-f` and `-s` are omitted, the pipeline will be run on all subjects.
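
Taken together, the options can be combined in a single invocation. A minimal sketch, assuming the container is launched as described in the "Run the container" section (the image name and volume mount below are placeholders for illustration, not the exact command):

```bash
# Hypothetical invocation: 4 parallel jobs, overwrite existing output,
# and restrict the analysis to the subjects listed in subjects.txt.
docker run -v "$(pwd)":/data enigma-pd-wml -n 4 -o -f subjects.txt
```
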
## Pipeline output

@@ -182,11 +163,12 @@ image.
After running your analysis, your data directory should have the following structure:

```bash
bids-data
data
├── dataset_description.json
├── derivatives
│   └── enigma-pd-wml
│       ├── enigma-pd-wml.log
│       ├── enigma-pd-wml-results.zip
│       └── sub-1
│           ├── ses-1
│           │   ├── input/
@@ -207,7 +189,6 @@ bids-data
│   └── anat
│       ├── sub-1_ses-2_FLAIR.nii.gz
│       └── sub-1_ses-2_T1w.nii.gz
└── subjects.txt
```
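
Once a run completes, a quick way to sanity-check the outputs is to list the derivatives folder and peek inside the results archive. A sketch, assuming the directory layout above (`unzip -l` only lists the archive contents):

```bash
# List the pipeline outputs and inspect the results archive.
ls data/derivatives/enigma-pd-wml/
unzip -l data/derivatives/enigma-pd-wml/enigma-pd-wml-results.zip
```
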

#### Session-level zip files
40 changes: 32 additions & 8 deletions analysis_script.sh
@@ -379,8 +379,10 @@ function runAnalysis (){

function parseArguments() {
    n=1
    subjects_file=""
    subjects=()
    export overwrite=false
    while getopts "n:o" opt; do
    while getopts "n:of:s:" opt; do
        case ${opt} in
            n)
                n=${OPTARG}
@@ -389,6 +391,13 @@ function parseArguments() {
echo "overwrite option enabled"
overwrite=true
;;
f)
subjects_file="${data_path}/${OPTARG}"
;;
s)
IFS=',' read -r -a temp_subjects <<< "${OPTARG}"
subjects+=("${temp_subjects[@]}")
;;
?)
echo "Invalid option: -${OPTARG}."
exit 1
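
The `IFS=',' read -r -a` idiom in the new `s` case splits a comma-separated argument into a bash array without spawning a subshell. A standalone sketch of how it behaves (the subject IDs are illustrative):

```bash
# Split a comma-separated subject list into an array.
IFS=',' read -r -a temp_subjects <<< "sub-1,sub-2,sub-3"
printf '%s\n' "${temp_subjects[@]}"   # prints sub-1, sub-2, sub-3 on separate lines
```
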
@@ -406,18 +415,33 @@ function setupRunAnalysis(){

    parseArguments "$@"

    # Get the list of sessions for each subject
    subjects_list=${data_path}/subjects.txt
    echo "Obtaining list of subjects and sessions based on subjects in ${subjects_list}"
    # Include subjects from file if provided
    if [[ -n "$subjects_file" ]]; then
        echo "Using subjects file: ${subjects_file}"
        while IFS=$'\n' read -r subject; do
            subjects+=("$subject")
        done < "$subjects_file"
    fi

    if [[ ${#subjects[@]} -gt 0 ]]; then
        # remove duplicates (subjects passed via both `-f` and `-s`)
        subjects=($(echo "${subjects[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' '))
    else
        echo "No subjects file or list provided. Running analysis on all subjects."
        # shellcheck disable=SC2011
        subjects=($(ls -d ${data_path}/sub-* | xargs -n 1 basename))
    fi
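
The deduplication line word-splits the array, pipes it through `sort -u`, and re-splits the result; it assumes subject IDs contain no whitespace or glob characters. A standalone sketch of its behaviour:

```bash
# Duplicates can arise when a subject is passed via both -f and -s;
# sort -u collapses them into a single entry.
subjects=(sub-2 sub-1 sub-2)
subjects=($(echo "${subjects[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' '))
echo "${subjects[@]}"   # -> sub-1 sub-2
```
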

    # Get sessions for selected subjects
    subjects_sessions=()
    while IFS=$'\n' read -r subject; do
        # shellcheck disable=SC2038
        sessions=$(find ${data_path}/${subject}/ses-*/anat/${subject}_ses-*_T1w.nii.gz | xargs -n 1 dirname | xargs -n 1 dirname | xargs -n 1 basename) #
    for subject in "${subjects[@]}"; do
        # shellcheck disable=SC2038
        sessions=$(find ${data_path}/${subject}/ses-*/anat/${subject}_ses-*_T1w.nii.gz | xargs -n 1 dirname | xargs -n 1 dirname | xargs -n 1 basename)
        for session in $sessions; do
            subjects_sessions+=("${subject} ${session}")
            mkdir -p ${data_path}/derivatives/enigma-pd-wml/${subject}/${session}
        done
    done < $subjects_list
    done
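
The `find | xargs dirname | xargs dirname | xargs basename` chain strips the filename and the `anat` directory to recover the session name. A worked example of what it produces for an illustrative path:

```bash
# data/sub-1/ses-1/anat/sub-1_ses-1_T1w.nii.gz
#   dirname  -> data/sub-1/ses-1/anat
#   dirname  -> data/sub-1/ses-1
#   basename -> ses-1
find data/sub-1/ses-*/anat/sub-1_ses-*_T1w.nii.gz \
    | xargs -n 1 dirname | xargs -n 1 dirname | xargs -n 1 basename
```
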

    # Run the analysis
    if [[ $n -eq 1 ]]
Expand Down
