DepMap Analysis

In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import os, sys, re, pickle, wget, math
from intermine.webservice import Service
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
import scipy.stats as stats

# Load data direct from source
class A(object):
    data = {}
    url = {}
    url['expression'] = 'https://ndownloader.figshare.com/files/16757690'
    url['sample_info'] = 'https://ndownloader.figshare.com/files/16757723'
    url['copy_number'] = 'https://ndownloader.figshare.com/files/17857886'
    url['mutations'] = 'https://ndownloader.figshare.com/files/16757702'
    url['achilles_crispr'] = 'https://ndownloader.figshare.com/files/16757666'
    url['d2_rnai'] = 'https://ndownloader.figshare.com/files/13515395'
    url['sensitivity'] = 'https://ndownloader.figshare.com/files/17008628'
    url['mfr'] = 'http://bmbl.sdstate.edu/MFR/data/resource%20data/tr_dv_ts.dataset.zip'
    url['reactome'] = 'https://reactome.org/download/current/NCBI2Reactome.txt'
    summary = '../README.md'
    
# Wipe the slate clean on the analysis summary
os.system("echo '' > "+A.summary)
    
def printmd(string):
    """Prints formatted markdown text"""
    display(Markdown(string))
    
def report(text,silent=False):
    """Print markdown in this notebook and saves markdown-only summary in a file"""
    if not silent:
        printmd(text)
    
    f = open(A.summary,'a')
    f.write(text+'\n')
    
def get_gene_descriptions():
    """Load gene names and descriptions from humanmine (http://www.humanmine.org),
    an integrated database of human genome information.  Use cached data if available"""
    
    archive = 'data/gene_info.p'
    if os.path.exists(archive):
        df = pd.read_pickle(archive)
        return df
    
    service = Service("https://www.humanmine.org/humanmine/service")
    query = service.new_query("Gene")
    cols = ["primaryIdentifier", "symbol", "briefDescription", "description","proteins.uniprotAccession"]
    query.add_view(*cols)
    query.add_constraint("organism.taxonId", "=", "9606", code = "A")    
    df_rows = []

    for row in query.rows():
        df_rows.append(
            [row["primaryIdentifier"], 
             row["symbol"], 
             row["briefDescription"], 
             row["description"],
             row["proteins.uniprotAccession"]
            ])

    df = pd.DataFrame(data=df_rows,columns=cols)
    df.to_pickle(archive)
    return df
    
def get_data(key):
    """Load input data.
    Arguments: key for the data source (eg: expression, sample_info...)
    1) If the data is cached on the filesystem, load and return the dataframe
    2) Otherwise, load the data from the source URL, cache, return the dataframe
    """
    
    data_cache = 'data/'+key+'.p'
    if os.path.exists(data_cache):
        df = pd.read_pickle(data_cache)
    else:
        print("Downloading",key,"from source")
        df = pd.read_csv(A.url[key],low_memory=False,index_col=0)
        df.to_pickle(data_cache)
        
    A.data[key] = df    
    return df

def ncbi_gene_ids(genes):
    """Converts gene name "symbol (ncbi_id)"
    to integer NCBI ID.  Also catch missing associations
    for NCBI gene ID/symbol
    """
    ncbi_cols = []
    for g in genes:
        match = re.search('(\S+)\s+\((\d+)\)',g)
        if match:
            s = match.group(1)
            g = match.group(2)
            if not ncbi2symbol.get(int(g)):
                ncbi2symbol[int(g)] = s
                symbol2ncbi[s] = int(g)
        else:
            print("No match",g)
        ncbi_cols.append(int(g))
        

    return ncbi_cols

def get_pathway_info():
    # Map NCBI IDs to reactome pathways
    # File downloaded from https://reactome.org/download/current/NCBI2Reactome.txt
    archive = 'data/pathway_info.p'
    
    if os.path.exists(archive):
        pathway_info = pickle.load(open(archive,'rb'))
        return pathway_info
    
    pathway_info = {}
    if not os.path.exists('data/NCBI2Reactome.txt'):
        wget.download(A.url['reactome'],out='data/NCBI2Reactome.txt')
    with open('data/NCBI2Reactome.txt') as n2r:
        for line in n2r.readlines():
            ncbi, pathway_id, url, pathway_name, type, species = line.strip().split('\t')
            # only human pathways
            if species != 'Homo sapiens':
                continue
            # only curated pathways
            if type == 'IEA':
                continue
            pathway_info[ncbi] = [pathway_id, url, pathway_name]

    pickle.dump(pathway_info,open(archive,'wb'))
    return pathway_info
In [2]:
report('''
# Sample analysis of Cancer Dependency Map (DepMap) data 
## Request
* Identify the most frequent genetic alterations (could be mutations or copy number variations) in the cancer cell lines
* Match them with the best genetic dependencies that could be used for drug development for the cancers that carry those mutations
* Take into account the lineage of cancer cell lines (certain mutations/CNVs may be restricted to a specific lineage)

## Resources
### DepMap (https://depmap.org/portal) Data 
* Cell line metadata
* Expression (RNASeq)
* Copy number variation
* Mutations
* Genetic dependency
  * Crispr (Achilles)
  * RNAi (DEMETER2)
  
<b>Citations:</b> 
* Jordan G. Bryan, John M. Krill-Burger, Thomas M. Green, Francisca Vazquez, Jesse S. Boehm, Todd R. Golub, William C. Hahn, David E. Root, Aviad Tsherniak. (2018). Improved estimation of cancer dependencies from large-scale RNAi screens using model-based normalization and data integration. Nature Communications 9, 1. https://doi.org/10.1038/s41467-018-06916-5</font>
* DepMap, Broad (2019): DepMap 19Q3 Public. figshare. Dataset doi:10.6084/m9.figshare.9201770.v1.
* Robin M. Meyers, Jordan G. Bryan, James M. McFarland, Barbara A. Weir, ... David E. Root, William C. Hahn, Aviad Tsherniak. Computational correction of copy number effect improves specificity of CRISPR-Cas9 essentiality screens in cancer cells. Nature Genetics 2017 October 49:1779–1784. doi:10.1038/ng.3984


### Jupyter (https://jupyter.org/)
* Python programming framework for analysis prototyping and reporting

### GitHub (https://github.com/)
* Revision control for Python code
* Reporting mechanism for analysis summary and details


## Analysis overview
1. Aggregate data from depmap.org
2. Deal with various data format issues
3. Map cell lines to cancer lineages and sublineages
4. Capture low-hanging fruit genes with high copy number and (Achilles Crispr) gene dependency scores < 0
5. Sanity check filters using ERBB2
6. Compare RNAi and Crispr data fo gene dependency
7. Plot copy number vs. gene dependency by lineage
8. Generate list of candidate genes as a sortable table
9. Add cross-validation filtering data for RNAi, expression, TGCA/COSMIC mutation hotspots
10. target gene info pages

## Next steps
1. Improve thresholds (cutoffs for copy number, gene dependency, expression TPMs, etc.
2. Integrate chemical sensitivity data
3. Filter genes that are already known drug targets (identify truly novel targets)

<a href="analysis/anaysis_details.html">Details...</a>

''')

Sample analysis of Cancer Dependency Map (DepMap) data

Request

  • Identify the most frequent genetic alterations (could be mutations or copy number variations) in the cancer cell lines
  • Match them with the best genetic dependencies that could be used for drug development for the cancers that carry those mutations
  • Take into account the lineage of cancer cell lines (certain mutations/CNVs may be restricted to a specific lineage)

Resources

DepMap (https://depmap.org/portal) Data

  • Cell line metadata
  • Expression (RNASeq)
  • Copy number variation
  • Mutations
  • Genetic dependency
    • Crispr (Achilles)
    • RNAi (DEMETER2)

Citations:

  • Jordan G. Bryan, John M. Krill-Burger, Thomas M. Green, Francisca Vazquez, Jesse S. Boehm, Todd R. Golub, William C. Hahn, David E. Root, Aviad Tsherniak. (2018). Improved estimation of cancer dependencies from large-scale RNAi screens using model-based normalization and data integration. Nature Communications 9, 1. https://doi.org/10.1038/s41467-018-06916-5</font>
  • DepMap, Broad (2019): DepMap 19Q3 Public. figshare. Dataset doi:10.6084/m9.figshare.9201770.v1.
  • Robin M. Meyers, Jordan G. Bryan, James M. McFarland, Barbara A. Weir, ... David E. Root, William C. Hahn, Aviad Tsherniak. Computational correction of copy number effect improves specificity of CRISPR-Cas9 essentiality screens in cancer cells. Nature Genetics 2017 October 49:1779–1784. doi:10.1038/ng.3984

Jupyter (https://jupyter.org/)

  • Python programming framework for analysis prototyping and reporting

GitHub (https://github.com/)

  • Revision control for Python code
  • Reporting mechanism for analysis summary and details

Analysis overview

  1. Aggregate data from depmap.org
  2. Deal with various data format issues
  3. Map cell lines to cancer lineages and sublineages
  4. Capture low-hanging fruit genes with high copy number and (Achilles Crispr) gene dependency scores < 0
  5. Sanity check filters using ERBB2
  6. Compare RNAi and Crispr data fo gene dependency
  7. Plot copy number vs. gene dependency by lineage
  8. Generate list of candidate genes as a sortable table
  9. Add cross-validation filtering data for RNAi, expression, TGCA/COSMIC mutation hotspots
  10. target gene info pages

Next steps

  1. Improve thresholds (cutoffs for copy number, gene dependency, expression TPMs, etc.
  2. Integrate chemical sensitivity data
  3. Filter genes that are already known drug targets (identify truly novel targets)

Details...

In [3]:
printmd('''
## Get gene descriptions, etc.
Use data from humanmine (http://www.humanmine.org/) to map NCBI gene IDs to name, summary, symbol, uniprot
''')

Get gene descriptions, etc.

Use data from humanmine (http://www.humanmine.org/) to map NCBI gene IDs to name, summary, symbol, uniprot

In [4]:
gd = get_gene_descriptions()
printmd('sample gene info:')
display(gd.head())
ncbi2name    = {}
ncbi2symbol  = {}
symbol2ncbi  = {}
ncbi2desc    = {}
ncbi2uniprot = {}
uniprot2ncbi = {}

for i, r in gd.iterrows():
    ncbi, symbol, name, description, uniprot = list(r)
    ncbi = int(ncbi)
    ncbi2name[ncbi] = name
    ncbi2symbol[ncbi] = symbol
    ncbi2desc[ncbi] = description
    
    # ncbi <-> uniprot can be 1:many
    if ncbi2uniprot.get(ncbi) is None:
        ncbi2uniprot[ncbi] = set()
    ncbi2uniprot[ncbi].add(uniprot)
    uniprot2ncbi[uniprot] = ncbi
    
print("Done mapping gene info")

sample gene info:

primaryIdentifier symbol briefDescription description proteins.uniprotAccession
0 2632 GBE1 1,4-alpha-glucan branching enzyme 1 The protein encoded by this gene is a glycogen... Q04446
1 3248 HPGD 15-hydroxyprostaglandin dehydrogenase This gene encodes a member of the short-chain ... E9PBZ2
2 3248 HPGD 15-hydroxyprostaglandin dehydrogenase This gene encodes a member of the short-chain ... P15428
3 3248 HPGD 15-hydroxyprostaglandin dehydrogenase This gene encodes a member of the short-chain ... P15428
4 3248 HPGD 15-hydroxyprostaglandin dehydrogenase This gene encodes a member of the short-chain ... P15428
Done mapping gene info
In [5]:
report('''
## Aggregate CCLE cell lines by lineage
<b>DepMap source file:</b> sample_info.csv
* Group all cell lines (CCLE cell line IDs) by main (parent) lineage
* If a lineage has > 1 defined sublineages, also aggregate cell lines by sublineage (eg: leukemia -> AML)
''')

Aggregate CCLE cell lines by lineage

DepMap source file: sample_info.csv

  • Group all cell lines (CCLE cell line IDs) by main (parent) lineage
  • If a lineage has > 1 defined sublineages, also aggregate cell lines by sublineage (eg: leukemia -> AML)
In [6]:
sample_info = get_data('sample_info')
lineages = set(sample_info.lineage.dropna())

lineage2cell    = {}
sublineage2cell = {}
cell2lineage    = {}
cell2sublineage = {}
has_sub = set()
is_sub = set()

for l in lineages:
    ldf = sample_info[sample_info.lineage == l]
    subtypes = set(ldf.lineage_subtype.dropna())
    
    # cell lines for sublineage
    if len(subtypes) > 1:
        has_sub.add(l)
        for sub in subtypes:
            if l in sub:
                lname = sub
            else:
                lname = l + '_' + sub

            is_sub.add(lname)
            sub_df = ldf[ldf.lineage_subtype == sub]

            # map sublineage to cell lines
            sublineage2cell[lname] = list(sub_df.index)
    
            # map cell lines to sub-lineage
            for cell in sublineage2cell[lname]:
                cell2sublineage[cell] = lname
    else:
        sublineage2cell[l] = list(ldf.index)
        for cell in sublineage2cell[l]:
            cell2sublineage[cell] = l
                
    # all cell lines for lineage
    lineage2cell[l] = list(ldf.index)
    
    # parent lineage of each cell line
    for cell in lineage2cell[l]:
        cell2lineage[cell] = l
    

report('<pre>')
report("Number of cell lines: "+str(len(cell2lineage)))
report("Number of lineages: "+str(len(lineage2cell)))
report("Number of lineages with sub-lineages: "+str(len(has_sub)))
report("Number of sub-lineages "+str(len(is_sub)))
report('</pre>')

Number of cell lines: 1429

Number of lineages: 33

Number of lineages with sub-lineages: 16

Number of sub-lineages 61

</pre>

In [7]:
report('''
## Expression Data (RNASeq)
RNAseq TPM gene expression data for just protein coding genes using RSEM. Log2 transformed, using a pseudo-count of 1.

<b>DepMap source file:</b> CCLE_expression.csv
* Transpose columns (gene names) and rows (CCLE cell line IDs)
* Translate gene names to NCBI gene IDs
''')

Expression Data (RNASeq)

RNAseq TPM gene expression data for just protein coding genes using RSEM. Log2 transformed, using a pseudo-count of 1.

DepMap source file: CCLE_expression.csv

  • Transpose columns (gene names) and rows (CCLE cell line IDs)
  • Translate gene names to NCBI gene IDs
In [8]:
exp = get_data('expression').T
exp.index = ncbi_gene_ids(list(exp.index))
printmd('Sample expression data:')
display(exp.head())

Sample expression data:

ACH-001097 ACH-001485 ACH-001396 ACH-000534 ACH-000742 ACH-001818 ACH-000545 ACH-000836 ACH-001959 ACH-000470 ... ACH-000305 ACH-000916 ACH-000603 ACH-000296 ACH-000978 ACH-000904 ACH-000110 ACH-000031 ACH-000261 ACH-000682
7105 0.000000 0.000000 2.883621 0.839960 3.722466 4.032982 4.251719 4.632268 3.321928 3.681449 ... 4.152183 4.997744 0.910733 5.382321 5.002252 4.316870 5.227279 4.714795 4.447579 5.976364
64102 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.150560 0.042644 0.028569 0.070389 0.028569 0.028569 0.124328
8813 4.667324 5.755689 4.471838 5.376082 6.029674 5.933100 5.651052 6.704180 7.357288 7.294896 ... 6.024364 5.898450 5.740118 6.551670 6.523719 5.990955 6.764341 6.745910 6.748864 5.983450
57147 1.761285 2.375735 2.347666 2.687061 2.192194 2.097611 3.440952 2.792855 1.691534 2.400538 ... 2.361768 3.814550 2.505891 2.592158 2.613532 3.185867 3.537296 2.776104 2.650765 2.939227
55732 3.554589 2.634593 3.392317 4.440288 2.533563 1.536053 3.275007 4.079805 4.328406 2.849999 ... 1.659925 3.446256 3.161888 4.352617 3.243364 4.846493 4.491212 3.621759 4.607626 2.918386

5 rows × 1210 columns

In [9]:
report('''
## Mutation Data 
<b>DepMap source file:</b> CCLE_mutations.csv
* Keep track of TCGA and COSMIC hotspot genes by lineage
* Just using hotspot genes for now but track deleterious mutations by lineage for future reference
''')

Mutation Data

DepMap source file: CCLE_mutations.csv

  • Keep track of TCGA and COSMIC hotspot genes by lineage
  • Just using hotspot genes for now but track deleterious mutations by lineage for future reference
In [10]:
mutations = get_data('mutations')
In [11]:
# Filter hotspot genes
hotspots = []
hotspots.append(mutations[mutations.isTCGAhotspot])
hotspots.append(mutations[mutations.isCOSMIChotspot])
hotspots = pd.concat(hotspots).drop_duplicates()

hotspot_genes = set(hotspots.Entrez_Gene_Id)

lineage_hotspots = {}
sublineage_hotspots = {}

for g in hotspot_genes:
    lineage_hotspots[g] = {}
    sublineage_hotspots[g] = {}
    
    gdf = hotspots[hotspots.Entrez_Gene_Id == g]
    
    for i, r in gdf.iterrows():
        cell = r.DepMap_ID
        if cell2lineage.get(cell) is not None:
            lineage = cell2lineage[cell]
            if lineage_hotspots[g].get(lineage) is None:
                lineage_hotspots[g][lineage] = 0
            lineage_hotspots[g][lineage] += 1
            
        if cell2sublineage.get(cell) is not None:
            sublineage = cell2sublineage[cell]
            if sublineage_hotspots[g].get(sublineage) is None:
                sublineage_hotspots[g][sublineage] = 0
            sublineage_hotspots[g][sublineage] += 1
            
print("Done mapping hotspots")
report(str(len(hotspot_genes))+' TCGA or COSMIC hotspot genes')
Done mapping hotspots

8704 TCGA or COSMIC hotspot genes

In [12]:
# # Filter deleterious mutations
# damaging = mutations[mutations.Variant_annotation.isin(['damaging','non-conserving'])]
# damaging = damaging[damaging.isDeleterious == True]
# genes = set(hotspots.Entrez_Gene_Id)


# genes = set(damaging.Entrez_Gene_Id)

# lineage_mutations = {}
# sublineage_mutations = {}

# for g in genes:
#     lineage_mutations[g] = {}
#     sublineage_mutations[g] = {}
    
#     gdf = damaging[damaging.Entrez_Gene_Id == g]
    
#     for i, r in gdf.iterrows():
#         cell = r.DepMap_ID
#         if cell2lineage.get(cell) is not None:
#             lineage = cell2lineage[cell]
#             if lineage_mutations[g].get(lineage) is None:
#                 lineage_mutations[g][lineage] = 0
#             lineage_mutations[g][lineage] += 1
            
#         if cell2sublineage.get(cell) is not None:
#             sublineage = cell2sublineage[cell]
#             if sublineage_mutations[g].get(sublineage) is None:
#                 sublineage_mutations[g][sublineage] = 0
#             sublineage_mutations[g][sublineage] += 1
            
# print("Done mapping deleterious mutations")
In [13]:
report('''
## Copy number data
Gene level copy number data, log2 transformed with a pseudo count of 1. This is generated by mapping genes onto the segment level calls.

<b>Depmap source file:</b> CCLE_gene_cn_v2.csv
* Transpose columns (gene names) and rows (CCLE cell line IDs)
* Translate gene names to NCBI gene IDs
''')

Copy number data

Gene level copy number data, log2 transformed with a pseudo count of 1. This is generated by mapping genes onto the segment level calls.

Depmap source file: CCLE_gene_cn_v2.csv

  • Transpose columns (gene names) and rows (CCLE cell line IDs)
  • Translate gene names to NCBI gene IDs
In [14]:
cn = get_data('copy_number')
cn.columns = ncbi_gene_ids(list(cn.columns))

# cn.columns = ncbi_gene_ids(list(cn.columns))
# print(cn.shape)
# keep_genes = []
# for g in cn.columns:
#     if g in hotspot_genes:
#         keep_genes.append(g)
# cn = cn[keep_genes]
# print(cn.shape)
cn = cn.T
printmd('Sample copy number data:')
display(cn.head())
report('Copy number data descritive stats:')
display(cn.iloc[:,:10].describe())

Sample copy number data:

ACH-001793 ACH-002176 ACH-000652 ACH-001295 ACH-000798 ACH-001399 ACH-000111 ACH-002358 ACH-000367 ACH-000584 ... ACH-000280 ACH-001563 ACH-002124 ACH-000286 ACH-001148 ACH-001359 ACH-001560 ACH-001791 ACH-001125 ACH-000110
1 1.086422 1.526188 0.776609 0.964857 0.986651 1.097441 1.577719 0.990491 0.972714 1.232228 ... 1.365183 1.027095 1.029024 0.841298 1.045162 1.143302 1.391900 1.001219 1.257749 1.931762
10 1.103066 1.044267 0.520576 1.001427 0.974178 0.775405 1.209647 1.001602 0.564791 0.379099 ... 0.690553 0.995183 1.068430 0.781903 0.986327 0.803954 0.707500 1.262303 1.123967 0.728684
100 0.910926 1.089161 1.555780 1.010086 0.987801 1.877675 0.826138 0.999580 1.128533 1.254932 ... 1.307992 0.996104 1.070793 2.327111 1.037611 1.140260 1.166051 0.953422 1.361431 1.710860
1000 0.869815 1.093874 0.534269 1.618418 0.975264 0.765916 1.781738 0.999387 0.583207 1.262886 ... 0.977053 0.964261 0.992269 0.639639 1.005763 0.793022 1.186009 1.140554 0.753544 0.815800
10000 1.178887 0.739725 1.072077 1.087050 0.992261 0.823112 1.618552 1.002886 1.185828 1.369970 ... 1.320073 0.980081 1.004353 1.169857 1.051142 0.775699 1.177641 1.118604 1.053262 0.989854

5 rows × 1660 columns

Copy number data descritive stats:

ACH-001793 ACH-002176 ACH-000652 ACH-001295 ACH-000798 ACH-001399 ACH-000111 ACH-002358 ACH-000367 ACH-000584
count 27639.000000 2.763900e+04 2.763900e+04 2.763900e+04 27639.000000 27639.000000 2.763900e+04 27639.000000 2.763900e+04 27639.000000
mean 1.098416 1.005611e+00 9.739989e-01 1.061859e+00 1.019288 1.057222 1.127191e+00 0.980485 9.532559e-01 1.057527
std 1.119507 3.482645e-01 2.416796e-01 6.685549e-01 0.123101 0.306522 3.919352e-01 0.092264 2.693422e-01 0.366487
min 0.171960 7.998209e-10 8.220704e-10 7.984062e-10 0.000036 0.000020 1.102449e-09 0.000021 8.282854e-10 0.000009
25% 0.889299 7.318536e-01 7.900758e-01 9.845272e-01 0.982870 0.812673 8.295737e-01 0.993549 6.015790e-01 0.700131
50% 1.053484 1.045336e+00 1.027553e+00 9.942944e-01 0.986097 1.003401 9.373404e-01 0.999580 1.056217e+00 0.990265
75% 1.113955 1.111985e+00 1.068509e+00 1.002200e+00 0.989550 1.120454 1.321072e+00 1.002886 1.131187e+00 1.262886
max 35.734800 4.952679e+00 4.804283e+00 3.963217e+01 1.462469 3.217691 3.229879e+00 1.006658 1.729972e+00 3.354475
In [15]:
report('''
## Search for high copy number genes in each of the lineages
* Seleting genes with copy number > 2 in <b><i>all</i></b> cell lines is a lineage is a bit too stringent
* Retain genes that have copy number >= 2 in 80% or more cell lines in a lineage
* Extract the data for any cell lines with high copy number genes to use as plotting baseline
''')

Search for high copy number genes in each of the lineages

  • Seleting genes with copy number > 2 in all cell lines is a lineage is a bit too stringent
  • Retain genes that have copy number >= 2 in 80% or more cell lines in a lineage
  • Extract the data for any cell lines with high copy number genes to use as plotting baseline
In [20]:
# for each cell line, get all of the genes with copy number > 2
keep_common_genes = []
keep_common_cells = []
keep_lineage = set()
keep_pairs = set()

report('<pre>')
for sublin in sublineage2cell:
    cells = sublineage2cell[sublin]
    #print(sublin,len(cells))
    
    best = {}
    for cell in cells:
        try:
            cdf = cn[cell]
            cdf = cdf[cdf >= 2]
            if len(cdf.index) > 0:
                best[cell] = set(cdf.index)
        except Exception as e:
            #print('Error',e)
            continue
            
    if len(best) == 0:
        continue
        
    best_genes = list(best.values())
    set1 = best_genes.pop()
    all_genes = set1.union(*best_genes)

    total = len(best_genes) + 1
    common_genes = []
    for g in all_genes:
        gcount = 0
        for gset in [set1,*best_genes]:
            if g in gset:
                gcount += 1
        if gcount / total >= 0.8:
            common_genes.append(g)
        
        
    keep_common_genes.extend(common_genes)
    common_symbols = [ncbi2symbol.get(g) for g in common_genes]
    if len(common_genes) > 0:
        keep_lineage.add(sublin)
        keep_common_cells.extend(cells)
        report(sublin+'; '+str(len(cells))+' cell lines; '+str(len(common_genes))+' high copy number genes')
        # Remember the gene:cell pairs to assemble the final table
        for cg in common_genes:
            for cell in cells:
                keep_pairs.add((cg,cell))
report('</pre>')

keep_common_genes = set(keep_common_genes)
keep_common_cells = set(keep_common_cells)

bone_chordoma; 4 cell lines; 11 high copy number genes

eye_uveal_melanoma; 5 cell lines; 1016 high copy number genes

ovary_immortalized; 1 cell lines; 9 high copy number genes

ovary_non_epithelial; 2 cell lines; 74 high copy number genes

lung_immortalized; 1 cell lines; 6 high copy number genes

adrenal_cortex; 1 cell lines; 200 high copy number genes

soft_tissue_fibrosarcoma; 1 cell lines; 2 high copy number genes

soft_tissue_liposarcoma; 5 cell lines; 44 high copy number genes

soft_tissue_sarcoma_undifferentiated; 2 cell lines; 164 high copy number genes

soft_tissue_epitheliod_sarcoma; 2 cell lines; 2 high copy number genes

esophagus_adenocarcinoma; 7 cell lines; 7 high copy number genes

gastric_adenosquamous; 1 cell lines; 508 high copy number genes

skin_squamous; 3 cell lines; 251 high copy number genes

skin_Merkel; 7 cell lines; 251 high copy number genes

skin_epidermoid_carcinoma; 1 cell lines; 234 high copy number genes

peripheral_nervous_system_PNET; 1 cell lines; 22 high copy number genes

upper_aerodigestive_buccal_mucosa; 1 cell lines; 592 high copy number genes

central_nervous_system_PNET; 1 cell lines; 2 high copy number genes

central_nervous_system_immortalized; 1 cell lines; 26 high copy number genes

central_nervous_system_meningioma; 3 cell lines; 151 high copy number genes

breast_HER2Amp; 13 cell lines; 11 high copy number genes

breast_ERneg; 1 cell lines; 960 high copy number genes

breast_immortalized; 2 cell lines; 8 high copy number genes

lymphoblastic_lymphoma; 1 cell lines; 311 high copy number genes

lymphoma_B-cell_ALL; 2 cell lines; 3 high copy number genes

</pre>

In [21]:
report('''
## DEMETER2 RNAi gene dependency data
Cancer cell line genetic dependencies estimated using the DEMETER2 model. DEMETER2 is applied to three large-scale RNAi screening datasets: the Broad Institute Project Achilles, Novartis Project DRIVE, and the Marcotte et al. breast cell line dataset. The model is also applied to generate a combined dataset of gene dependencies covering a total of 712 unique cancer cell lines.

<b>DepMap source file:</b> D2_combined_gene_dep_scores.csv 

* Data source uses CCLE names rather than DepMap cell line IDS
* Translate the cell line names to IDS for consistency with other data sources
* Also deal with rows in the table with multiple gene names (eg 'GTF2IP4&GTF2IP1 (100093631&2970)')
''')

DEMETER2 RNAi gene dependency data

Cancer cell line genetic dependencies estimated using the DEMETER2 model. DEMETER2 is applied to three large-scale RNAi screening datasets: the Broad Institute Project Achilles, Novartis Project DRIVE, and the Marcotte et al. breast cell line dataset. The model is also applied to generate a combined dataset of gene dependencies covering a total of 712 unique cancer cell lines.

DepMap source file: D2_combined_gene_dep_scores.csv

  • Data source uses CCLE names rather than DepMap cell line IDS
  • Translate the cell line names to IDS for consistency with other data sources
  • Also deal with rows in the table with multiple gene names (eg 'GTF2IP4&GTF2IP1 (100093631&2970)')
In [22]:
# Ingest data
d2 = get_data('d2_rnai')
print("Initial number of rows in dataframe:",d2.shape[0])
# split rows with multuple genes
cols = ['gene'] + list(d2.columns)
rows = []

print("Splitting multigene rows...")
for i, r in d2.iterrows():
    if '&' in i:
        symbols, ncbi = i.split(' ')
        symbols = symbols.split('&')
        ncbi = re.sub('\(|\)','',ncbi)
        ncbi = ncbi.split('&')
        assert len(symbols) == len(ncbi), "Length mismatch"
        for s, symbol in enumerate(symbols):
            gid = ncbi[s]
            row = [symbol+' ('+gid+')'] + list(r)
            rows.append(row)
    else:
        rows.append([i]+list(r))
d2 = pd.DataFrame(data=rows,columns=cols).set_index('gene')
print("Final number of rows in dataframe:",d2.shape[0])
Initial number of rows in dataframe: 17309
Splitting multigene rows...
Final number of rows in dataframe: 17731
In [23]:
# Map cell line names to IDs
sample_info = get_data('sample_info')
ccle_name2id = {}
for i, r in sample_info.iterrows():
    ccle_name2id[r['CCLE Name']] = i 

cell_line_names = list(d2.columns)

# Rename columns to use CCLE IDs and rows to use NCBI gene IDs
if isinstance(list(d2.index)[0],str):
    d2.columns = [ccle_name2id.get(i) or i for i in cell_line_names]
    d2.index = ncbi_gene_ids(list(d2.index))
print(d2.shape[0],"genes")
print(d2.shape[1],"cell lines")
d2.head()
17731 genes
712 cell lines
Out[23]:
ACH-001270 ACH-001000 ACH-001001 ACH-002319 ACH-001827 ACH-000956 ACH-000948 ACH-002320 ACH-000070 ACH-000411 ... ACH-000899 ACH-000765 ACH-000534 ACH-000762 ACH-000630 ACH-000570 ACH-001249 ACH-000097 ACH-000828 ACH-002331
1 NaN NaN 0.146042 -0.190388 0.907063 -0.019331 -0.016734 0.091580 0.035023 -0.122046 ... -0.088267 0.002171 NaN 0.120294 0.012540 0.111530 NaN -0.079313 -0.141559 0.214268
10 NaN NaN 0.102854 0.384106 0.403192 0.001925 -0.153933 -0.317969 0.012341 -0.205077 ... -0.003747 -0.321445 NaN -0.003256 -0.220472 0.073460 NaN -0.130921 0.127358 -0.405974
100 NaN NaN 0.168839 -0.120700 0.004394 -0.188700 -0.060818 -0.755058 0.129770 0.076273 ... -0.014085 0.039679 NaN 0.076521 0.106995 0.227977 NaN -0.134479 0.083506 -0.404291
1000 -0.194962 -0.028171 0.063047 -0.237251 -0.017059 -0.103047 -0.049460 0.130107 0.146864 -0.126198 ... -0.073435 -0.140041 -0.154436 -0.040308 -0.078707 0.000769 -0.139126 0.047022 -0.097644 -0.062622
10000 -0.256108 0.100751 -0.008077 0.060267 -0.094749 -0.066591 0.166029 0.149969 -0.053022 0.092426 ... 0.028714 -0.054628 0.450581 0.002932 0.129679 -0.072564 0.017161 0.123615 0.046846 0.125711

5 rows × 712 columns

In [24]:
report('''
## Achilles Crispr gene dependency data
CERES data with principle components strongly related to known batch effects removed, then shifted and scaled per cell line so the median nonessential KO effect is 0 and the median essential KO effect is -1.

<b>DepMap source file:</b> Achilles_gene_effect.csv 

* Translate gene names (column labels) to NCBI IDS
* Transpose rows and columns so each cell line is a column label with vertivally stacked gene data
''')

Achilles Crispr gene dependency data

CERES data with principle components strongly related to known batch effects removed, then shifted and scaled per cell line so the median nonessential KO effect is 0 and the median essential KO effect is -1.

DepMap source file: Achilles_gene_effect.csv

  • Translate gene names (column labels) to NCBI IDS
  • Transpose rows and columns so each cell line is a column label with vertivally stacked gene data
In [25]:
achilles = get_data('achilles_crispr').T
genes = list(achilles.index)
achilles.index = ncbi_gene_ids(genes)
print(achilles.shape[0],"genes")
print(achilles.shape[1],"cell lines")
printmd('Sample Achilles data:')
achilles.head()
18333 genes
625 cell lines

Sample Achilles data:

Out[25]:
ACH-000004 ACH-000005 ACH-000007 ACH-000009 ACH-000011 ACH-000012 ACH-000013 ACH-000014 ACH-000015 ACH-000017 ... ACH-001736 ACH-001737 ACH-001740 ACH-001745 ACH-001750 ACH-001765 ACH-001814 ACH-001838 ACH-001956 ACH-001957
1 0.168684 -0.068759 0.053893 0.059874 0.277165 0.008073 0.062131 0.143078 -0.090890 0.178427 ... 0.154567 -0.050307 0.005125 0.208843 0.044674 0.136364 0.216507 -0.086149 -0.076893 0.055750
29974 0.089128 0.218792 0.081444 -0.011153 0.085354 0.167177 0.038687 -0.035837 0.007894 0.106952 ... 0.019334 0.189813 0.349099 0.153637 0.126563 0.021261 -0.172366 0.082798 0.109464 0.004545
2 -0.196966 0.178252 -0.060170 -0.054367 0.007972 0.088705 -0.043841 0.010997 -0.185690 -0.068145 ... -0.124875 -0.079220 -0.194522 -0.134906 -0.082100 -0.107147 -0.265359 -0.147978 0.021325 0.067990
144568 -0.021260 0.158390 0.153435 0.060886 0.445843 0.307599 0.200285 0.182625 0.111529 0.109807 ... 0.051671 0.100741 0.217812 0.167583 0.132673 0.076223 0.045942 0.256595 0.204297 0.199098
127550 0.038541 -0.193862 0.087362 0.039767 -0.036717 0.015440 -0.070484 -0.034048 -0.033507 -0.195903 ... -0.196632 0.164481 -0.052438 -0.130067 -0.172350 -0.116583 0.123916 -0.054596 -0.080814 -0.042784

5 rows × 625 columns

In [26]:
report('''
### How many cell lines and genes are shared between D2 (RNAi) and Achilles (Crispr) gene dependency data sets?
''')

How many cell lines and genes are shared between D2 (RNAi) and Achilles (Crispr) gene dependency data sets?

In [27]:
d2_cells = set(d2.columns)
d2_genes = set(d2.index)
ac_cells = set(achilles.columns)
ac_genes = set(achilles.index)
shared_cells = d2_cells.intersection(ac_cells)
shared_genes = d2_genes.intersection(ac_genes)
report('<pre>')
report(str(len(shared_cells))+" cell lines are shared")
report(str(len(shared_genes))+" genes are shared")
report('</pre>')

423 cell lines are shared

16052 genes are shared

</pre>

In [29]:
report('### How do RNAi and Crispr screen compare for gene dependency score?')

How do RNAi and Crispr screen compare for gene dependency score?

In [28]:
# filter shared genes to just include our high copy number genes
my_d2 = d2.copy()
keep_genes = set()
for g in shared_genes:
    if g in keep_common_genes:
        keep_genes.add(g)

keep_cells = set()
for c in shared_cells:
    if c in keep_common_cells:
        keep_cells.add(c)
        
d2_s = my_d2[keep_cells]
d2_s = d2_s.T[keep_genes].T
ac_s = achilles[keep_cells]
ac_s = ac_s.T[keep_genes].T


d2_data = d2_s.to_numpy().flatten()
ac_data = ac_s.to_numpy().flatten()

# remove NaN and Inf
keep_d2 = []
keep_ac = []
for i, num1 in enumerate(d2_data):
    num2 = ac_data[i]
    if not np.isfinite(num1) and np.isfinite(num2):
        continue
    keep_d2.append(num1)
    keep_ac.append(num2)
d2_data = keep_d2
ac_data = keep_ac    
In [30]:
report('''
## Assemble the candidate gene table
1. Start with gene:cell pairs that were identified has having >= 80% high copy number for all cell lines in the sublineage
2. Get copynumber, Achilles (Crispr), D2 (RNAi), hotspot and expression data
3. Remove genes with copy number < 2 or Achilles (Crispr) dependency > 0
''')

Assemble the candidate gene table

  1. Start with gene:cell pairs that were identified has having >= 80% high copy number for all cell lines in the sublineage
  2. Get copynumber, Achilles (Crispr), D2 (RNAi), hotspot and expression data
  3. Remove genes with copy number < 2 or Achilles (Crispr) dependency > 0
In [34]:
# save the data
keep_saved_rows = []
for pair in keep_pairs:
    gene, cell = pair
    lineage = cell2sublineage[cell]
    symbol = ncbi2symbol[gene]

    # Get the copy number data for the gene:cell
    genes = set(cn.index)
    cells = set(cn.columns)
    if gene in genes and cell in cells:
        copynum = cn.loc[gene,cell]
        if copynum < 2:
            continue
    else:
        continue
    
    # get the Achilles gene dependency
    genes = set(achilles.index)
    cells = set(achilles.columns)
    if gene in genes and cell in cells:
        adata = achilles.loc[gene,cell]
        if adata > 0:
            continue
    else:
        continue
       
    # Get the RNAi gene dependency
    genes = set(d2.index)
    cells = set(d2.columns)
    if gene in genes and cell in cells:
        ddata = d2.loc[gene,cell]
    else:
        ddata = np.nan

    # Check if the gene is a mutation hotspot
    is_hotspot = gene in hotspot_genes
    
    # Check for expression
    genes = set(exp.index)
    cells = set(exp.columns)
    if gene in genes and cell in cells:
        edata = exp.loc[gene,cell]
    else:
        edata = np.nan
    is_expressed = edata > 0
    
    row = [cell,lineage,symbol,gene,copynum,adata,ddata,is_hotspot,is_expressed]
    keep_saved_rows.append(row)
    
In [35]:
cols = ['DepMap_ID','sublineage','HUGO_symbol','Entrez_ID','copy_number',
        'achilles_gene_dep','rnai_gene_dep','hotspot','rnaseq_expressed']
candidates = pd.DataFrame(data=keep_saved_rows,columns=cols)
set(candidates.sublineage)
Out[35]:
{'bone_chordoma',
 'breast_HER2Amp',
 'central_nervous_system_meningioma',
 'eye_uveal_melanoma',
 'ovary_non_epithelial',
 'skin_epidermoid_carcinoma',
 'skin_squamous',
 'soft_tissue_fibrosarcoma',
 'soft_tissue_sarcoma_undifferentiated',
 'upper_aerodigestive_buccal_mucosa'}
In [36]:
%matplotlib inline
plt.scatter(d2_data,ac_data)
slope, intercept, r_value, p_value, std_err = stats.linregress(d2_data,ac_data)
plt.xlabel('Crispr (Achilles)')
plt.ylabel('RNAi (DEMETER 2)')
plt.title('Gene dependency')
plt.text(-2.6,2.1,'slope='+'%.3f'%slope)
plt.text(-2.6,1.8,'r-squared ='+'%.3f' %r_value)
plt.text(-2.6,1.5,'p-value ='+'%.3f'%p_value)
plt.savefig('plots/gene_dependency.png',dpi=100)
plt.close()

report('<img src="plots/gene_dependency.png">')
report('* significant p-value means reject H0 that slope == 0')
report('* We will use the Achilles (Crispr) gene dependency score and check for positive agreement with RNAi later')

  • significant p-value means reject H0 that slope == 0
  • We will use the Achilles (Crispr) gene dependency score and check for positive agreement with RNAi later
In [39]:
keep_genes = set()
for g in ac_genes:
    if g in keep_common_genes:
        keep_genes.add(g)

keep_cells = set()
for c in ac_cells:
    if c in keep_common_cells:
        keep_cells.add(c)
keep_cells = list(keep_cells)
        
ac_s = achilles[keep_cells]
ac_s = ac_s.T[keep_genes].T
cn_s = cn[keep_cells]
cn_s = cn_s.T[keep_genes].T

ac_data = ac_s.to_numpy().flatten()
cn_data = cn_s.to_numpy().flatten()

# remove NaN and Inf
keep_ac = []
keep_cn = []
for i, num1 in enumerate(ac_data):
    num2 = cn_data[i]
    if not np.isfinite(num1) and np.isfinite(num2):
        continue
    keep_ac.append(num1)
    keep_cn.append(num2)
    
ac_data = keep_ac
cn_data = keep_cn  
cn_orig = cn_data
ac_orig = ac_data
In [40]:
report('''
### Sanity checking with ERBB2 (2064)
Evaluating breast cancer lineages where at least one cell line had copy number > 2:
* Is ERB2B in the hotspot gene set?
* We expect ERBB2 to have high copy number in breast cancer lineages
  ** What is the mean ERB2B copy number in breast cancers?
* The gene dependency score should be < 0
  ** What is the mean gene dependency score in breast cancers?
''')

Sanity checking with ERBB2 (2064)

Evaluating breast cancer lineages where at least one cell line had copy number > 2:

  • Is ERB2B in the hotspot gene set?
  • We expect ERBB2 to have high copy number in breast cancer lineages ** What is the mean ERB2B copy number in breast cancers?
  • The gene dependency score should be < 0 ** What is the mean gene dependency score in breast cancers?
In [41]:
report("ERBB2 is in hotspot gene set? <b>"+str(2064 in hotspot_genes).upper()+"</b>")
report('<pre>')

erb = cn.loc[2064]
nums = {}
is_high = set()
for c in cell2sublineage:
    lin = cell2sublineage.get(c)
    if lin is None or erb.get(c) is None:
        continue
        
    if 'breast' in lin:
        if nums.get(lin) is None:
            nums[lin] = []
        nums[lin].append(erb[c])
        if erb[c] >= 2:
            is_high.add(lin)
            #report(','.join([c,lin,str('%.2f'%erb[c])]))
for lin in nums:
    if lin in is_high:
        report("ERBB2 mean copy number for "+lin+" ("+str(len(nums[lin]))+" cell lines): "+str('%.2f'%np.mean(nums[lin])))
        
erb_gd = achilles.loc[2064]
nums = {}
for c in erb_gd.keys():
    lin = cell2sublineage.get(c)
    if lin is None or not 'breast' in lin:
        continue
        
    if nums.get(lin) is None:
        nums[lin] = []
    nums[lin].append(erb_gd[c])

for lin in nums:
    #if lin in is_high:
    report("ERBB2 mean gene dependency for "+lin+" ("+str(len(nums[lin]))+" cell lines): "+str('%.2f'%np.mean(nums[lin])))
        
report('</pre>')
    

ERBB2 is in hotspot gene set? TRUE

ERBB2 mean copy number for breast_HER2Amp (11 cell lines): 14.84

ERBB2 mean copy number for breast_TNBC (27 cell lines): 1.89

ERBB2 mean copy number for breast_TPBC (5 cell lines): 9.59

ERBB2 mean gene dependency for breast_HER2Amp (6 cell lines): -0.83

ERBB2 mean gene dependency for breast_ERpos (7 cell lines): -0.27

ERBB2 mean gene dependency for breast_TNBC (15 cell lines): -0.28

</pre>

In [42]:
report('''
## Lineages with observed high copy number genes

* Go through the list of lineages with high copy number genes
* Plot gene dependency vs. copy number
* Highlight genes/cell-lines with copy number > 2 and gene dependency < 0 for specific lineages

### Copy number vs gene dependency
''')

Lineages with observed high copy number genes

  • Go through the list of lineages with high copy number genes
  • Plot gene dependency vs. copy number
  • Highlight genes/cell-lines with copy number > 2 and gene dependency < 0 for specific lineages

Copy number vs gene dependency

In [43]:
for sublin in keep_lineage:
    cells = sublineage2cell[sublin]
    
    best = {}
    genes = set()
    for c in cells:
        try:
            cdf = cn[c]
            cdf = cdf[cdf >= 2]
            best[c] = set(cdf.index)
            for g in best[c]:
                genes.add(g)
                
        except Exception as e:
            #print('Error',e)
            continue
    
    if len(best) == 0:
        continue
    
    keep_cells = set()
    for c in ac_cells:
        if c in cells:
            keep_cells.add(c)
    if len(keep_cells) == 0:
        #print("no shared cells")
        continue

    keep_genes = set()
    for g in ac_genes:
        if g in genes:
            keep_genes.add(g)
    if len(keep_genes) == 0:
        print("no shared genes")
        continue
    
    try:
        ac_s = achilles[keep_cells]
        ac_s = ac_s.T[keep_genes].T
        cn_s = cn[keep_cells]
        cn_s = cn_s.T[keep_genes].T
    except:
        continue

    ac_data = ac_s.to_numpy().flatten()
    cn_data = cn_s.to_numpy().flatten()
    
    # remove NaN and Inf
    keep_ac = []
    keep_cn = []
    for i, d1 in enumerate(ac_data):
        d2 = cn_data[i]
        # Skip gene dependency > 0
        if d1 >= 0:
            continue
        # Skip copy number < 2
        if d2 < 2:
            continue
        
        if not np.isfinite(d1) and np.isfinite(d2):
            continue
        keep_ac.append(d1)
        keep_cn.append(d2)
    
    ac_data = keep_ac
    cn_data = keep_cn  
    
    plt.scatter(cn_orig,ac_orig,c='silver',label='other lineages')
    plt.scatter(cn_data,ac_data,c='r',label='this lineage: gene dep.<0; copy num >= 2')
    plt.legend()
    outfile = 'plots/'+sublin+'.png'
    plt.xlabel('copy number')
    plt.ylabel('Achilles gene dependency')
    plt.title(sublin)
    plt.savefig(outfile,dpi=100)
    plt.close()
    report('<img src="'+outfile+'">')

In [49]:
# Save the table of candidates to an HTML file

# Make gene symbol a link
rows = []
cols = candidates.columns
for i, r in candidates.iterrows():
    sym = r.HUGO_symbol
    sym = '<a href="reports/'+sym+'.html" target="gene_info">'+sym+'</a>'
    r.HUGO_symbol = sym
    rows.append(r)
report = pd.DataFrame(data=rows,columns=cols)

table = report.to_html(index=False)
table = table.replace('<table','<table id="candidates"')
table = table.replace('&gt;','>')
table = table.replace('&lt;','<')

with open('../_includes/template.html','r') as html_in:
    html = html_in.read()

html = html.replace('TABLE',table)

with open('../_includes/candidates.html','w') as html_out:
    html_out.write(html)
In [44]:
report('## Candidate target genes')
report('''
* All genes in this table have copy number > 2 and Achilles gene dependency < 0 for at least 80% of the cell lines in 1 cell lineage'
* Other data for cross-validation:
  * RNAi gene dependency score
  * Whether the gene is a TCGA or COSMIC mutation hotspot
  * Whether the gene has > 0 TPM RNASeq expression
  
{% include candidates.html %}
''')

Candidate target genes

  • All genes in this table have copy number > 2 and Achilles gene dependency < 0 for at least 80% of the cell lines in 1 cell lineage'
  • Other data for cross-validation:
    • RNAi gene dependency score
    • Whether the gene is a TCGA or COSMIC mutation hotspot
    • Whether the gene has > 0 TPM RNASeq expression

{% include candidates.html %}

In [45]:
genes = candidates[['HUGO_symbol','Entrez_ID']].drop_duplicates()
pathway_info = get_pathway_info()
for symbol, gene in genes.values:
    gene = str(gene)
    with open('../reports/template.html','r') as html_in:
        html = html_in.read()
    html = html.replace('SYMBOL',symbol)
    
    pathway = pathway_info.get(gene)
    if pathway is not None:
        pid, url, name = pathway
        html = html.replace('PNAME','<a href="'+url+'" target="_BLANK">'+name+'</a>')
        html = html.replace('START_HIDE','')
        html = html.replace('END_HIDE','')
    else:
        html = html.replace('START_HIDE','<!--')
        html = html.replace('END_HIDE','-->')
        
    symbol = symbol.replace(' ','_')
    try:
        with open('../reports/'+symbol+'.md','w') as html_out:
            html_out.write(html)
    except:
        pass
In [46]:
report('\n\n<h2>Links to gene information</h2>')
report('DepMap summary, expression, Reactome pathways<br>')
for g in set(candidates.HUGO_symbol):
    report('<a href="reports/'+g+'.html">'+g+'</a><br>')

Links to gene information

DepMap summary, expression, Reactome pathways

XPA

PVR

MIF

VXN

DUT

EVL

ICK

NNT

CA1

DAP

C9

EED

ZFR

AQR

SRF

CA8

MSC

CPQ

PGR

GAL

NRM

CRH

WRN

MGA

TRO

CIC

ERF

JUN

PGC

TG

GML

AIP

FUS

RP1

LTK

HDC

MYC