Merge pull request #5 from Neuroprosthetics-Lab/auto-download
Added a script to auto-download the data from Dryad
@@ -25,7 +25,7 @@ The code is organized into five main directories: `utils`, `analyses`, `data`, `
 ## Data
 
 ### Data Overview
 
-The data used in this repository (which can be downloaded from [Dryad](https://datadryad.org/stash/dataset/doi:10.5061/dryad.dncjsxm85)) consists of various datasets for recreating figures and training/evaluating the brain-to-text model:
+The data used in this repository (which can be downloaded from [Dryad](https://datadryad.org/stash/dataset/doi:10.5061/dryad.dncjsxm85), either manually from the website, or using `download_data.py`) consists of various datasets for recreating figures and training/evaluating the brain-to-text model:
 
 - `t15_copyTask.pkl`: This file contains the online Copy Task results required for generating Figure 2.
 - `t15_personalUse.pkl`: This file contains the Conversation Mode data required for generating Figure 4.
 - `t15_copyTask_neuralData.zip`: This dataset contains the neural data for the Copy Task.
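For reference, the `.pkl` files listed above are standard Python pickle files; a minimal sketch of loading one for inspection (assuming it unpickles into built-in Python/NumPy objects, which is not verified here) might look like:

```python
import pickle

# Hypothetical path: adjust to wherever the files were downloaded (data/ by default).
with open("data/t15_copyTask.pkl", "rb") as f:
    copy_task_results = pickle.load(f)

# Inspect the top-level structure before using it in the figure-generation scripts.
print(type(copy_task_results))
```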
download_data.py (new file, 111 lines)
@@ -0,0 +1,111 @@
"""
Run this file to download data from Dryad and unzip the zip files. Downloaded files end
up in this repository's data/ directory.

First create the b2txt25 conda environment. Then in a Terminal, at this repository's
top-level directory (nejm-brain-to-text/), run:

    conda activate b2txt25
    python download_data.py
"""

import sys
import os
import urllib.request
import json
import zipfile


########################################################################################
#
# Helpers.
#
########################################################################################


def display_progress_bar(block_num, block_size, total_size, message=""):
|
||||
""""""
|
||||
bytes_downloaded_so_far = block_num * block_size
|
||||
MB_downloaded_so_far = bytes_downloaded_so_far / 1e6
|
||||
MB_total = total_size / 1e6
|
||||
sys.stdout.write(
|
||||
f"\r{message}\t\t{MB_downloaded_so_far:.1f} MB / {MB_total:.1f} MB"
|
||||
)
|
||||
sys.stdout.flush()
|
||||
|
||||
|
||||
########################################################################################
#
# Main function.
#
########################################################################################


def main():
    """Download the data files for the latest Dryad dataset version into data/."""
    DRYAD_DOI = "10.5061/dryad.dncjsxm85"

    ## Make sure the command is being run from the right place and we can see the data/
    ## directory.

    DATA_DIR = "data/"
    data_dirpath = os.path.abspath(DATA_DIR)
    assert os.getcwd().endswith(
        "nejm-brain-to-text"
    ), f"Please run the download command from the nejm-brain-to-text directory (instead of {os.getcwd()})"
    assert os.path.exists(
        data_dirpath
    ), "Cannot find the data directory to download into."

    ## Get the list of files from the latest version on Dryad.

    DRYAD_ROOT = "https://datadryad.org"
    urlified_doi = DRYAD_DOI.replace("/", "%2F")

    versions_url = f"{DRYAD_ROOT}/api/v2/datasets/doi:{urlified_doi}/versions"
    with urllib.request.urlopen(versions_url) as response:
        versions_info = json.loads(response.read().decode())

    files_url_path = versions_info["_embedded"]["stash:versions"][-1]["_links"][
        "stash:files"
    ]["href"]
    files_url = f"{DRYAD_ROOT}{files_url_path}"
    with urllib.request.urlopen(files_url) as response:
        files_info = json.loads(response.read().decode())

    file_infos = files_info["_embedded"]["stash:files"]

    ## Download each file into the data directory (and unzip for certain files).

    for file_info in file_infos:
        filename = file_info["path"]

        if filename == "README.md":
            continue

        download_path = file_info["_links"]["stash:download"]["href"]
        download_url = f"{DRYAD_ROOT}{download_path}"

        download_to_filepath = os.path.join(data_dirpath, filename)

        urllib.request.urlretrieve(
            download_url,
            download_to_filepath,
            reporthook=lambda *args: display_progress_bar(
                *args, message=f"Downloading {filename}"
            ),
        )
        sys.stdout.write("\n")

        # If this file is a zip file, unzip it into the data directory.

        if file_info["mimeType"] == "application/zip":
            print(f"Extracting files from {filename} ...")
            with zipfile.ZipFile(download_to_filepath, "r") as zf:
                zf.extractall(data_dirpath)

    print(f"\nDownload complete. See data files in {data_dirpath}\n")


if __name__ == "__main__":
    main()
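As a quick sanity check before running the full download, the two Dryad API endpoints that `download_data.py` relies on can also be queried just to list what will be fetched. A minimal sketch, reusing the DOI and JSON fields from the script above (the `size` field is an assumption about the API response, hence the fallback):

```python
import json
import urllib.request

# Same dataset DOI and API root as in download_data.py.
DRYAD_ROOT = "https://datadryad.org"
DRYAD_DOI = "10.5061/dryad.dncjsxm85"

# Latest dataset version, then its file listing (same traversal as the script).
versions_url = f"{DRYAD_ROOT}/api/v2/datasets/doi:{DRYAD_DOI.replace('/', '%2F')}/versions"
with urllib.request.urlopen(versions_url) as response:
    versions_info = json.loads(response.read().decode())

files_href = versions_info["_embedded"]["stash:versions"][-1]["_links"]["stash:files"]["href"]
with urllib.request.urlopen(f"{DRYAD_ROOT}{files_href}") as response:
    files_info = json.loads(response.read().decode())

# Print each file's name; "size" is assumed to be present in the response.
for file_info in files_info["_embedded"]["stash:files"]:
    print(file_info["path"], file_info.get("size", "size unknown"))
```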