Commit 504ad38f authored by Hasan Celik's avatar Hasan Celik
Browse files

init

parents
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.~
.snake*
GENOMEDIR='/s/genomes/droMel/r6.22/star-indexes'
DATAPATH='/s/project/gagneurlab_shared/chipseq-polycomb/analysis-november-2018/'
NUMTHEADS=48
CHIPSEQFILESFIRSTPAIR=(
# 'Input_esc_0-6hr_S3_R1_001'
# 'Input_WT_0-6hr.conc.R1'
# 'Input_WT_18-24hr_S7_R1_001'
# 'H3K27me3_ChIP_esc_0-6hr_S4_R1_001'
# 'H3K27me3_ChIP_WT_0-6hr.conc.R1'
# 'H3K27me3_ChIP_WT_18-24hr_S5_R1_001'
# 'Input_esc_18-24hr.conc.R1'
# 'Input_WT_0-6hr_S2_R1_001'
# 'H3K27me3_ChIP_esc_18-24hr.conc.R1'
# 'H3K27me3_ChIP_WT_0-6hr_S1_R1_001'
# 'Input_esc_0-6hr.conc.R1'
'Input_esc_18-24hr_S6_R1_001'
'Input_WT_18-24hr.conc.R1'
'H3K27me3_ChIP_esc_0-6hr.conc.R1'
'H3K27me3_ChIP_esc_18-24hr_S8_R1_001'
'H3K27me3_ChIP_WT_18-24hr.conc.R1'
)
CHIPSEQFILESSECONDPAIR=(
# 'Input_esc_0-6hr_S3_R2_001'
# 'Input_WT_0-6hr.conc.R2'
# 'Input_WT_18-24hr_S7_R2_001'
# 'H3K27me3_ChIP_esc_0-6hr_S4_R2_001'
# 'H3K27me3_ChIP_WT_0-6hr.conc.R2'
# 'H3K27me3_ChIP_WT_18-24hr_S5_R2_001'
# 'Input_esc_18-24hr.conc.R2'
# 'Input_WT_0-6hr_S2_R2_001'
# 'H3K27me3_ChIP_esc_18-24hr.conc.R2'
# 'H3K27me3_ChIP_WT_0-6hr_S1_R2_001'
# 'Input_esc_0-6hr.conc.R2'
'Input_esc_18-24hr_S6_R2_001'
'Input_WT_18-24hr.conc.R2'
'H3K27me3_ChIP_esc_0-6hr.conc.R2'
'H3K27me3_ChIP_esc_18-24hr_S8_R2_001'
'H3K27me3_ChIP_WT_18-24hr.conc.R2'
)
CHIPSEQFILESFINAL=(
# 'Input_esc_0-6hr_S3_001'
# 'Input_WT_0-6hr.conc'
# 'Input_WT_18-24hr_S7_001'
# 'H3K27me3_ChIP_esc_0-6hr_S4_001'
# 'H3K27me3_ChIP_WT_0-6hr.conc'
# 'H3K27me3_ChIP_WT_18-24hr_S5_001'
# 'Input_esc_18-24hr.conc'
# 'Input_WT_0-6hr_S2_001'
# 'H3K27me3_ChIP_esc_18-24hr.conc'
# 'H3K27me3_ChIP_WT_0-6hr_S1_001'
# 'Input_esc_0-6hr.conc'
'Input_esc_18-24hr_S6_001'
'Input_WT_18-24hr.conc'
'H3K27me3_ChIP_esc_0-6hr.conc'
'H3K27me3_ChIP_esc_18-24hr_S8_001'
'H3K27me3_ChIP_WT_18-24hr.conc'
)
for ((i=0;i<${#CHIPSEQFILESFIRSTPAIR[@]};++i));
do
CHIPSEQFILE=${CHIPSEQFILESFINAL[i]}
CHIPSEQFP=$DATAPATH${CHIPSEQFILESFIRSTPAIR[i]}'.fastq.gz'
CHIPSEQSP=$DATAPATH${CHIPSEQFILESSECONDPAIR[i]}'.fastq.gz'
CHIPSEQBAM=$CHIPSEQFILE'.bam'
echo $CHIPSEQFILE
STAR --runMode alignReads --genomeDir $GENOMEDIR --readFilesIn $CHIPSEQFP $CHIPSEQSP --readFilesCommand zcat --alignIntronMax 1 --seedSearchStartLmax 30 --outFilterMultimapNmax 1 --outFilterMismatchNmax 1 --runThreadN $NUMTHEADS
samtools view -hb -q 255 -@ $NUMTHEADS Aligned.out.sam | samtools sort -m 32G -@ $NUMTHEADS -o $CHIPSEQBAM -
samtools index $CHIPSEQBAM
mkdir $CHIPSEQFILE
mv ./*.out $CHIPSEQFILE
mv ./*.tab $CHIPSEQFILE
mv ./*.sam $CHIPSEQFILE
mv ./*.bam $CHIPSEQFILE
mv ./*.bam.bai $CHIPSEQFILE
done
data_path = '/s/project/gagneurlab_shared/chipseq-polycomb/'
csv_files = [
'GSM714136_yw_18to24hr_H3K27me3_ChIPSeq_export.txt',
'GSM714137_yw_18to24hr_INPUT_DNA_export.txt',
'GSM714140_esc_18to24hr_H3K27me3_ChIPSeq_export.txt',
'GSM714141_esc_18to24hr_INPUT_DNA_export.txt'
]
for d in csv_files:
with open(data_path + d.replace('.txt', '.fastq'), 'w') as output_file:
with open(data_path + d) as input_file:
for l in input_file:
line = l.strip().split('\t')
line = [line[i] for i in range(3) + range(4, 10)]
title = '%s:%s:%s:%s:%s#%s/%s' % tuple(line[:-2])
seq = line[-2]
qual = line[-1]
fastq_line = '@{title}\n{seq}\n+\n{qual}\n'.format(
title=title, seq=seq, qual=qual)
output_file.write(fastq_line)
GENOMEDIR='/s/genomes/droMel/r6.22/star-indexes'
GENOMEPATH='/s/genomes/droMel/r6.22/dmel-all-chromosome-r6.22.fasta'
GTFPATH='/s/genomes/droMel/r6.22/dmel-all-r6.22.gtf'
NUMTHEADS=48
STAR \
--runMode genomeGenerate \
--genomeDir $GENOMEDIR \
--genomeFastaFiles $GENOMEPATH \
--sjdbGTFfile $GTFPATH \
--runThreadN $NUMTHEADS
This diff is collapsed.
Original
3R:2650000-2680000
3L:12390000-12450000
Mapped to v6
3R:6824278-6854278
3L:12416053-12476053
import os
workdir:
".."
def init():
from settings import FASTA_PATH, GTF_PATH, DATA_DIR, INPUT_DIR, OUTPUT_DIR, fastq_paired_files, fastq_files, count_files
global FASTA_PATH, GTF_PATH, DATA_DIR, INPUT_DIR, OUTPUT_DIR, fastq_paired_files, fastq_files, count_files
init()
INDEXES_DIR = '/s/genomes/droMel/r6.22/star-indexes'
num_threads = 48
rule all:
input:
expand(os.path.join(OUTPUT_DIR, '{fastq_paired}', '{fastq_paired}.bam'),
fastq_paired=fastq_files),
expand(os.path.join(OUTPUT_DIR, '{fastq}', '{fastq}.bam'),
fastq=fastq_files),
expand(os.path.join(OUTPUT_DIR, '{bam_file}', '{bam_file}_counts.txt'),
bam_file=count_files)
rule index:
input:
FASTA_PATH = FASTA_PATH,
GTF_PATH = GTF_PATH
params:
num_threads = num_threads,
INDEXES_DIR = INDEXES_DIR
output:
os.path.join(INDEXES_DIR, 'genomeParameters.txt')
shell: "STAR --runMode genomeGenerate --genomeDir {params.INDEXES_DIR} --genomeFastaFiles {input.FASTA_PATH} --sjdbGTFfile {input.GTF_PATH} --runThreadN {params.num_threads}"
rule align_paired:
input:
os.path.join(INDEXES_DIR, 'genomeParameters.txt'),
INDEXES_DIR = INDEXES_DIR,
read1 = os.path.join(
INPUT_DIR, '{fastq_paired}_read1.fastq.gz'),
read2 = os.path.join(
INPUT_DIR, '{fastq_paired}_read2.fastq.gz')
params:
align_intron_max = 1,
seed_search_start_lmax = 30,
out_filter_multiple_map_nmax = 1,
out_filter_mismatch_nmax = 1,
num_threads = num_threads,
output:
bam_file = os.path.join(
OUTPUT_DIR, '{fastq_paired}', '{fastq_paired}.bam')
shell:
"""
STAR --runMode alignReads --genomeDir {input.INDEXES_DIR} --readFilesIn {input.read1} {input.read2} --readFilesCommand zcat --alignIntronMax {params.align_intron_max} --seedSearchStartLmax {params.seed_search_start_lmax} --outFilterMultimapNmax {params.out_filter_multiple_map_nmax} --outFilterMismatchNmax {params.out_filter_mismatch_nmax} --outFileNamePrefix {output[0]} --runThreadN {params.num_threads}
samtools view -hb -q 255 -@ {params.num_threads} {output[0]}Aligned.out.sam | samtools sort -m 2G -@ {params.num_threads} -o {output.bam_file} -
samtools index {params.bam_file}
"""
rule align_single:
input:
os.path.join(INDEXES_DIR, 'genomeParameters.txt'),
INDEXES_DIR = INDEXES_DIR,
read = os.path.join(INPUT_DIR, '{fastq}.fastq.gz')
params:
align_intron_max = 1,
seed_search_start_lmax = 30,
out_filter_multiple_map_nmax = 1,
out_filter_mismatch_nmax = 1,
num_threads = num_threads
output:
bam_file = os.path.join(OUTPUT_DIR, '{fastq}', '{fastq}.bam')
shell:
"""
STAR --runMode alignReads --genomeDir {input.INDEXES_DIR} --readFilesIn {input.read} --readFilesCommand zcat --alignIntronMax {params.align_intron_max} --seedSearchStartLmax {params.seed_search_start_lmax} --outFilterMultimapNmax {params.out_filter_multiple_map_nmax} --outFilterMismatchNmax {params.out_filter_mismatch_nmax} --outFileNamePrefix {output[0]} --runThreadN {params.num_threads}
samtools view -hb -q 255 -@ {params.num_threads} {output[0]}Aligned.out.sam | samtools sort -m 2G -@ {params.num_threads} -o {output.bam_file} -
samtools index {params.bam_file}
"""
rule count_rna_reads:
input:
GTF_PATH = GTF_PATH,
bam_file = os.path.join(OUTPUT_DIR, '{bam_file}', '{bam_file}.bam')
params:
num_threads = 5
output:
os.path.join(OUTPUT_DIR, '{bam_file}', '{bam_file}_counts.txt')
shell:
"""
featureCounts -T {params.num_threads} -p -t exon -g gene_id -a {input.GTF_PATH} -o {output} {input.bam_file}
"""
import os
FASTA_PATH = '/s/genomes/droMel/r6.22/dmel-all-chromosome-r6.22.fasta'
GTF_PATH = '/s/genomes/droMel/r6.22/dmel-all-r6.22.gtf'
DATA_DIR = '/s/project/gagneurlab_shared/chipseq-polycomb'
INPUT_DIR = os.path.join(DATA_DIR, 'input')
OUTPUT_DIR = os.path.join(DATA_DIR, 'output')
fastq_paired_files = [
'dm_embryo_embryo2-4_1',
'dm_embryo_embryo2-4_2',
'dm_embryo_embryo14-16_1',
'dm_embryo_embryo14-16_2',
]
fastq_files = [
'H3K36me2_W_14-16_hr_chipseq_rep1',
'H3K36me2_W_14-16_hr_chipseq_rep2',
'H3K36me2_W_14-16_hr_input_rep1',
'H3K36me2_W_14-16_hr_input_rep2',
'H3K36me3_14-16_hr_chipseq_rep1',
'H3K36me3_14-16_hr_chipseq_rep2',
'H3K36me3_14-16_hr_input_rep1',
'H3K36me3_14-16_hr_input_rep2',
'H3K4me3_14-16_hr_chipseq_rep1',
'H3K4me3_14-16_hr_chipseq_rep2',
'H3K4me3_14-16_hr_input_rep1',
'H3K4me3_14-16_hr_input_rep2'
]
count_files = [
'dm_embryo_embryo2-4_1',
'dm_embryo_embryo2-4_2',
'dm_embryo_embryo14-16_1',
'dm_embryo_embryo14-16_2'
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment