Notebooks for Preprocessing GSE Data
We provide a series of notebooks to help you preprocess your data. These notebooks are designed to be easy to use on public GSE datasets.
The notebooks are designed to be placed in the raw_data folder. Choose the notebook that matches the type of data you have:
Batched data with 10X format: batch_10X Example Notebook
Batched data with csv.gz format: batch_csv Example Notebook
If the files are in txt.gz format, change the delimiter to \t.
Single data with 10X format: single_10X Example Notebook
Single data with txt.gz format: single_txt Example Notebook
If the file is in csv.gz format, change the delimiter to a comma (,).
Note: some older 10X datasets use genes.tsv instead of features.tsv. In that case, append a Gene Expression column to every row of the file
and rename it to features.tsv.gz to make it compatible with the latest version of sc.read_10x_mtx(). For example:
import gzip
import os
import shutil

# Sort the downloaded files into one folder per sample and rename them to
# barcodes.tsv.gz / features.tsv.gz / matrix.mtx.gz.
# Expects `sample_list`, `file_list` and `dir_path` to be defined beforehand
# (presumably the sample IDs, the file names found in `dir_path`, and the
# data folder itself -- confirm against the notebook).
for sample in sample_list:
    sample_dir = os.path.join(dir_path, sample)
    # exist_ok=True lets this cell be re-run after a partial run.
    os.makedirs(sample_dir, exist_ok=True)
    for sample_file in file_list:
        # NOTE(review): this is a substring match, so a sample ID that is
        # contained in another ID (e.g. GSM1 / GSM10) also claims the other
        # sample's files.
        if sample not in sample_file:
            continue
        src = os.path.join(dir_path, sample_file)
        # Skip anything that is not a regular file: entries that were
        # already moved, or the sample folder itself.
        if not os.path.isfile(src):
            continue
        if 'barcodes' in sample_file:
            shutil.move(src, os.path.join(sample_dir, 'barcodes.tsv.gz'))
        elif 'genes' in sample_file:
            # Old-style genes.tsv: append a "Gene Expression" column to
            # every row and store the result as features.tsv.gz.
            features_path = os.path.join(sample_dir, 'features.tsv.gz')
            opener = gzip.open if sample_file.endswith('.gz') else open
            # Binary mode keeps the original bytes and line endings intact.
            with opener(src, 'rb') as fin, gzip.open(features_path, 'wb') as fout:
                for line in fin:
                    fout.write(line.rstrip(b'\n') + b'\tGene Expression\n')
            # Remove the source only after the new file was written in full.
            os.remove(src)
        elif 'matrix' in sample_file:
            shutil.move(src, os.path.join(sample_dir, 'matrix.mtx.gz'))
        else:
            # Any other file of this sample is moved without renaming.
            shutil.move(src, os.path.join(sample_dir, sample_file))