% Bibliography database: journal articles sharing the MeSH keyword "Proteins".
%
% Conventions used in every entry below:
%   - citation keys are the numeric record identifiers of the exporting system
%     and are kept as-is so existing citations keep resolving;
%   - month uses the predefined three-letter macros (unquoted); when the source
%     record carried a day of month it is kept in the non-standard "day" field,
%     which styles that do not know it simply ignore;
%   - pages uses "--" with the end page written out in full;
%   - titles carry no trailing period, and case-sensitive tokens are braced;
%   - journal holds the full journal name;
%   - abstract and keywords are reproduced verbatim from the source records.

% 696: NCI-CPTAC DREAM proteogenomic challenge (Cell Systems, 2020).
@article{696,
  author   = {Yang, Mi and Petralia, Francesca and Li, Zhi and Li, Hongyang and
              Ma, Weiping and Song, Xiaoyu and Kim, Sunkyu and Lee, Heewon and
              Yu, Han and Lee, Bora and Bae, Seohui and Heo, Eunji and
              Kaczmarczyk, Jan and St{\k e}pniak, Piotr and Warcho{\l}, Micha{\l} and
              Yu, Thomas and Calinawan, Anna P. and Boutros, Paul C. and
              Payne, Samuel H. and Reva, Boris and Boja, Emily and
              Rodriguez, Henry and Stolovitzky, Gustavo and Guan, Yuanfang and
              Kang, Jaewoo and Wang, Pei and Feny{\"o}, David and
              Saez-Rodriguez, Julio},
  title    = {Community Assessment of the Predictability of Cancer Protein and Phosphoprotein Levels from Genomics and Transcriptomics},
  journal  = {Cell Systems},
  volume   = {11},
  pages    = {186--195.e9},
  year     = {2020},
  month    = aug,
  day      = {26},
  issn     = {2405-4720},
  doi      = {10.1016/j.cels.2020.06.013},
  keywords = {Crowdsourcing, Female, Genomics, Humans, Machine Learning, Male, Neoplasms, Phosphoproteins, Proteins, Proteomics, Transcriptome},
  abstract = {Cancer is driven by genomic alterations, but the processes causing this disease are largely performed by proteins. However, proteins are harder and more expensive to measure than genes and transcripts. To catalyze developments of methods to infer protein levels from other omics measurements, we leveraged crowdsourcing via the NCI-CPTAC DREAM proteogenomic challenge. We asked for methods to predict protein and phosphorylation levels from genomic and transcriptomic data in cancer patients. The best performance was achieved by an ensemble of models, including as predictors transcript level of the corresponding genes, interaction between genes, conservation across tumor types, and phosphosite proximity for phosphorylation prediction. Proteins from metabolic pathways and complexes were the best and worst predicted, respectively. The performance of even the best-performing model was modest, suggesting that many proteins are strongly regulated through translational control and degradation. Our results set a reference for the limitations of computational inference in proteogenomics. A record of this paper{\textquoteright}s transparent peer review process is included in the Supplemental Information.},
}

% 713: COVID-19 drug repurposing (Signal Transduction and Targeted Therapy, 2020).
% The source record has no abstract.
@article{713,
  author   = {Loucera, Carlos and Esteban-Medina, Marina and Rian, Kinza and
              Falco, Matias M. and Dopazo, Joaquin and Pe{\~n}a-Chilet, Maria},
  title    = {Drug repurposing for {COVID-19} using machine learning and mechanistic models of signal transduction circuits related to {SARS-CoV-2} infection},
  journal  = {Signal Transduction and Targeted Therapy},
  volume   = {5},
  pages    = {290},
  year     = {2020},
  month    = dec,
  day      = {11},
  issn     = {2059-3635},
  doi      = {10.1038/s41392-020-00417-y},
  keywords = {Computational Chemistry, COVID-19, drug repositioning, Humans, Machine Learning, Molecular Docking Simulation, Molecular Targeted Therapy, Proteins, SARS-CoV-2, Signal Transduction},
}

% 610: druggable space around the Fanconi anemia pathway (BMC Bioinformatics, 2019).
@article{610,
  author   = {Esteban-Medina, Marina and Pe{\~n}a-Chilet, Maria and
              Loucera, Carlos and Dopazo, Joaquin},
  title    = {Exploring the druggable space around the {Fanconi} anemia pathway using machine learning and mechanistic models},
  journal  = {BMC Bioinformatics},
  volume   = {20},
  pages    = {370},
  year     = {2019},
  month    = jul,
  day      = {02},
  issn     = {1471-2105},
  doi      = {10.1186/s12859-019-2969-0},
  keywords = {Databases, Factual, Fanconi Anemia, Genomics, Humans, Machine Learning, Phenotype, Proteins, Signal Transduction},
  abstract = {BACKGROUND: In spite of the abundance of genomic data, predictive models that describe phenotypes as a function of gene expression or mutations are difficult to obtain because they are affected by the curse of dimensionality, given the disbalance between samples and candidate genes. And this is especially dramatic in scenarios in which the availability of samples is difficult, such as the case of rare diseases.

RESULTS: The application of multi-output regression machine learning methodologies to predict the potential effect of external proteins over the signaling circuits that trigger Fanconi anemia related cell functionalities, inferred with a mechanistic model, allowed us to detect over 20 potential therapeutic targets.

CONCLUSIONS: The use of artificial intelligence methods for the prediction of potentially causal relationships between proteins of interest and cell activities related with disease-related phenotypes opens promising avenues for the systematic search of new targets in rare diseases.},
}

% 474: signaling-pathway activity as drug-sensitivity biomarkers (Scientific Reports, 2015).
@article{474,
  author   = {Amadoz, Alicia and Sebasti{\'a}n-Leon, Patricia and Vidal, Enrique and
              Salavert, Francisco and Dopazo, Joaquin},
  title    = {Using activation status of signaling pathways as mechanism-based biomarkers to predict drug sensitivity},
  journal  = {Scientific Reports},
  volume   = {5},
  pages    = {18494},
  year     = {2015},
  month    = dec,
  day      = {18},
  issn     = {2045-2322},
  doi      = {10.1038/srep18494},
  keywords = {Algorithms, Antineoplastic Agents, biomarkers, Cell Line, Tumor, Cell Survival, gene expression, Humans, Lethal Dose 50, Neoplasms, Phosphorylation, Proteins, Signal Transduction},
  abstract = {Many complex traits, as drug response, are associated with changes in biological pathways rather than being caused by single gene alterations. Here, a predictive framework is presented in which gene expression data are recoded into activity statuses of signal transduction circuits (sub-pathways within signaling pathways that connect receptor proteins to final effector proteins that trigger cell actions). Such activity values are used as features by a prediction algorithm which can efficiently predict a continuous variable such as the IC50 value. The main advantage of this prediction method is that the features selected by the predictor, the signaling circuits, are themselves rich-informative, mechanism-based biomarkers which provide insight into or drug molecular mechanisms of action (MoA).},
}

% 519: SNPeffect 4.0 database (Nucleic Acids Research, 2012).
@article{519,
  author   = {De Baets, Greet and Van Durme, Joost and Reumers, Joke and
              Maurer-Stroh, Sebastian and Vanhee, Peter and Dopazo, Joaquin and
              Schymkowitz, Joost and Rousseau, Frederic},
  title    = {{SNPeffect} 4.0: on-line prediction of molecular and structural effects of protein-coding variants},
  journal  = {Nucleic Acids Research},
  volume   = {40},
  pages    = {D935--D939},
  year     = {2012},
  month    = jan,
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkr996},
  keywords = {Databases, Protein, Humans, Internet, Meta-Analysis as Topic, Phenotype, Polymorphism, Single Nucleotide, Protein Conformation, Proteins},
  abstract = {Single nucleotide variants (SNVs) are, together with copy number variation, the primary source of variation in the human genome and are associated with phenotypic variation such as altered response to drug treatment and susceptibility to disease. Linking structural effects of non-synonymous SNVs to functional outcomes is a major issue in structural bioinformatics. The SNPeffect database (http://snpeffect.switchlab.org) uses sequence- and structure-based bioinformatics tools to predict the effect of protein-coding SNVs on the structural phenotype of proteins. It integrates aggregation prediction (TANGO), amyloid prediction (WALTZ), chaperone-binding prediction (LIMBO) and protein stability analysis (FoldX) for structural phenotyping. Additionally, SNPeffect holds information on affected catalytic sites and a number of post-translational modifications. The database contains all known human protein variants from UniProt, but users can now also submit custom protein variants for a SNPeffect analysis, including automated structure modeling. The new meta-analysis application allows plotting correlations between phenotypic features for a user-selected set of variants.},
}

% 542: functional analysis of MAQC-II genomic signatures (The Pharmacogenomics Journal, 2010).
% The source record gives author initials only.
@article{542,
  author   = {Shi, W. and Bessarabova, M. and Dosymbekov, D. and Dezso, Z. and
              Nikolskaya, T. and Dudoladova, M. and Serebryiskaya, T. and
              Bugrim, A. and Guryanov, A. and Brennan, R. J. and Shah, R. and
              Dopazo, J. and Chen, M. and Deng, Y. and Shi, T. and Jurman, G. and
              Furlanello, C. and Thomas, R. S. and Corton, J. C. and Tong, W. and
              Shi, L. and Nikolsky, Y.},
  title    = {Functional analysis of multiple genomic signatures demonstrates that classification algorithms choose phenotype-related genes},
  journal  = {The Pharmacogenomics Journal},
  volume   = {10},
  pages    = {310--323},
  year     = {2010},
  month    = aug,
  issn     = {1473-1150},
  doi      = {10.1038/tpj.2010.35},
  keywords = {Algorithms, Databases, Genetic, Endpoint Determination, Gene Expression Profiling, Genomics, Humans, Neural Networks, Computer, Oligonucleotide Array Sequence Analysis, Phenotype, Predictive Value of Tests, Proteins, Quality Control},
  abstract = {Gene expression signatures of toxicity and clinical response benefit both safety assessment and clinical practice; however, difficulties in connecting signature genes with the predicted end points have limited their application. The Microarray Quality Control Consortium II (MAQCII) project generated 262 signatures for ten clinical and three toxicological end points from six gene expression data sets, an unprecedented collection of diverse signatures that has permitted a wide-ranging analysis on the nature of such predictive models. A comprehensive analysis of the genes of these signatures and their nonredundant unions using ontology enrichment, biological network building and interactome connectivity analyses demonstrated the link between gene signatures and the biological basis of their predictive power. Different signatures for a given end point were more similar at the level of biological properties and transcriptional control than at the gene level. Signatures tended to be enriched in function and pathway in an end point and model-specific manner, and showed a topological bias for incoming interactions. Importantly, the level of biological similarity between different signatures for a given end point correlated positively with the accuracy of the signature predictions. These findings will aid the understanding, and application of predictive genomic signatures, and support their broader application in predictive medicine.},
}

% 552: review on functional genomics and gene modules (Expert Review of Proteomics, 2010).
@article{552,
  author   = {Minguez, Pablo and Dopazo, Joaquin},
  title    = {Functional genomics and networks: new approaches in the extraction of complex gene modules},
  journal  = {Expert Review of Proteomics},
  volume   = {7},
  pages    = {55--63},
  year     = {2010},
  month    = feb,
  issn     = {1744-8387},
  doi      = {10.1586/epr.09.103},
  keywords = {Gene Expression Regulation, Gene Regulatory Networks, Genomics, Protein Binding, Proteins, Systems biology},
  abstract = {The engine that makes the cell work is made of an intricate network of molecular interactions. Nowadays, the elements and relationships of this complex network can be studied with several types of high-throughput techniques. The dream of having a global picture of the cell from different perspectives that can jointly explain cell behavior is, at least technically, feasible. However, this task can only be accomplished by filling the gap between data and information. The availability of methods capable of accurately managing, integrating and analyzing the results from these experiments is crucial for this purpose. Here, we review the new challenges raised by the availability of different genomic data, as well as the new proposals presented to cope with the increasing data complexity. Special emphasis is given to approaches that explore the transcriptome trying to describe the modules of genes that account for the traits studied.},
}

% 596: joint SNPeffect / PupaSuite annotation (Nucleic Acids Research, 2008).
@article{596,
  author   = {Reumers, Joke and Conde, Lucia and Medina, Ignacio and
              Maurer-Stroh, Sebastian and Van Durme, Joost and Dopazo, Joaquin and
              Rousseau, Frederic and Schymkowitz, Joost},
  title    = {Joint annotation of coding and non-coding single nucleotide polymorphisms and mutations in the {SNPeffect} and {PupaSuite} databases},
  journal  = {Nucleic Acids Research},
  volume   = {36},
  pages    = {D825--D829},
  year     = {2008},
  month    = jan,
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkm979},
  keywords = {Amino Acid Substitution, Animals, Databases, Genetic, Genetic Diseases, Inborn, HSP70 Heat-Shock Proteins, Humans, Internet, Mice, MicroRNAs, mutation, Polymorphism, Single Nucleotide, Proteins, Rats, RNA Splice Sites, Transcription Factors},
  abstract = {Single nucleotide polymorphisms (SNPs) are, together with copy number variation, the primary source of variation in the human genome. SNPs are associated with altered response to drug treatment, susceptibility to disease and other phenotypic variation. Furthermore, during genetic screens for disease-associated mutations in groups of patients and control individuals, the distinction between disease causing mutation and polymorphism is often unclear. Annotation of the functional and structural implications of single nucleotide changes thus provides valuable information to interpret and guide experiments. The SNPeffect and PupaSuite databases are now synchronized to deliver annotations for both non-coding and coding SNP, as well as annotations for the SwissProt set of human disease mutations. In addition, SNPeffect now contains predictions of Tango2: an improved aggregation detector, and Waltz: a novel predictor of amyloid-forming sequences, as well as improved predictors for regions that are recognized by the Hsp70 family of chaperones. The new PupaSuite version incorporates predictions for SNPs in silencers and miRNAs including their targets, as well as additional methods for predicting SNPs in TFBSs and splice sites. Also predictions for mouse and rat genomes have been added. In addition, a PupaSuite web service has been developed to enable data access, programmatically. The combined database holds annotations for 4,965,073 regulatory as well as 133,505 coding human SNPs and 14,935 disease mutations, and phenotypic descriptions of 43,797 human proteins and is accessible via http://snpeffect.vib.be and http://pupasuite.bioinfo.cipf.es/.},
}

% 598: PhylomeDB database (Nucleic Acids Research, 2008).
@article{598,
  author   = {Huerta-Cepas, Jaime and Bueno, Anibal and Dopazo, Joaquin and
              Gabald{\'o}n, Toni},
  title    = {{PhylomeDB}: a database for genome-wide collections of gene phylogenies},
  journal  = {Nucleic Acids Research},
  volume   = {36},
  pages    = {D491--D496},
  year     = {2008},
  month    = jan,
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkm899},
  keywords = {Base Sequence, Escherichia coli, Genes, Genomics, History, Ancient, Humans, Phylogeny, Proteins, Saccharomyces cerevisiae, Sequence Alignment},
  abstract = {The complete collection of evolutionary histories of all genes in a genome, also known as phylome, constitutes a valuable source of information. The reconstruction of phylomes has been previously prevented by large demands of time and computer power, but is now feasible thanks to recent developments in computers and algorithms. To provide a publicly available repository of complete phylomes that allows researchers to access and store large-scale phylogenomic analyses, we have developed PhylomeDB. PhylomeDB is a database of complete phylomes derived for different genomes within a specific taxonomic range. All phylomes in the database are built using a high-quality phylogenetic pipeline that includes evolutionary model testing and alignment trimming phases. For each genome, PhylomeDB provides the alignments, phylogentic trees and tree-based orthology predictions for every single encoded protein. The current version of PhylomeDB includes the phylomes of Human, the yeast Saccharomyces cerevisiae and the bacterium Escherichia coli, comprising a total of 32 289 seed sequences with their corresponding alignments and 172 324 phylogenetic trees. PhylomeDB can be publicly accessed at http://phylomedb.bioinfo.cipf.es.},
}

% 600: SeqProfCod predictor of disease-related mutations (Human Mutation, 2008).
@article{600,
  author   = {Capriotti, Emidio and Arbiza, Leonardo and Casadio, Rita and
              Dopazo, Joaquin and Dopazo, Hern{\'a}n and Marti-Renom, Marc A.},
  title    = {Use of estimated evolutionary strength at the codon level improves the prediction of disease-related protein mutations in humans},
  journal  = {Human Mutation},
  volume   = {29},
  pages    = {198--204},
  year     = {2008},
  month    = jan,
  issn     = {1098-1004},
  doi      = {10.1002/humu.20628},
  keywords = {Algorithms, Codon, Computational Biology, Databases, Protein, DNA Mutational Analysis, Evolution, Molecular, Genetic Predisposition to Disease, Genetic Variation, Genome, Human, Humans, Iduronic Acid, Point Mutation, Polymorphism, Single Nucleotide, Proteins, Tumor Suppressor Protein p53},
  abstract = {Predicting the functional impact of protein variation is one of the most challenging problems in bioinformatics. A rapidly growing number of genome-scale studies provide large amounts of experimental data, allowing the application of rigorous statistical approaches for predicting whether a given single point mutation has an impact on human health. Up until now, existing methods have limited their source data to either protein or gene information. Novel in this work, we take advantage of both and focus on protein evolutionary information by using estimated selective pressures at the codon level. Here we introduce a new method (SeqProfCod) to predict the likelihood that a given protein variant is associated with human disease or not. Our method relies on a support vector machine (SVM) classifier trained using three sources of information: protein sequence, multiple protein sequence alignments, and the estimation of selective pressure at the codon level. SeqProfCod has been benchmarked with a large dataset of 8,987 single point mutations from 1,434 human proteins from SWISS-PROT. It achieves 82\% overall accuracy and a correlation coefficient of 0.59, indicating that the estimation of the selective pressure helps in predicting the functional impact of single-point mutations. Moreover, this study demonstrates the synergic effect of combining two sources of information for predicting the functional effects of protein variants: protein sequence/profile-based information and the evolutionary estimation of the selective pressures at the codon level. The results of large-scale application of SeqProfCod over all annotated point mutations in SWISS-PROT (available for download at http://sgu.bioinfo.cipf.es/services/Omidios/; last accessed: 24 August 2007), could be used to support clinical studies.},
}

% 603: DBAli structure-mining tools (Nucleic Acids Research, 2007).
@article{603,
  author   = {Marti-Renom, Marc A. and Pieper, Ursula and Madhusudhan, M. S. and
              Rossi, Andrea and Eswar, Narayanan and Davis, Fred P. and
              Al-Shahrour, F{\'a}tima and Dopazo, Joaquin and Sali, Andrej},
  title    = {{DBAli} tools: mining the protein structure space},
  journal  = {Nucleic Acids Research},
  volume   = {35},
  pages    = {W393--W397},
  year     = {2007},
  month    = jul,
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkm236},
  keywords = {Algorithms, Amino Acid Sequence, Computational Biology, Data Interpretation, Statistical, Databases, Protein, Internet, Molecular Sequence Data, Protein Conformation, Proteins, Pseudomonas aeruginosa, Sequence Alignment, Sequence Analysis, Protein, Sequence Homology, Amino Acid, Software, Structure-Activity Relationship},
  abstract = {The DBAli tools use a comprehensive set of structural alignments in the DBAli database to leverage the structural information deposited in the Protein Data Bank (PDB). These tools include (i) the DBAlit program that allows users to input the 3D coordinates of a protein structure for comparison by MAMMOTH against all chains in the PDB; (ii) the AnnoLite and AnnoLyze programs that annotate a target structure based on its stored relationships to other structures; (iii) the ModClus program that clusters structures by sequence and structure similarities; (iv) the ModDom program that identifies domains as recurrent structural fragments and (v) an implementation of the COMPARER method in the SALIGN command in MODELLER that creates a multiple structure alignment for a set of related protein structures. Thus, the DBAli tools, which are freely accessible via the World Wide Web at http://salilab.org/DBAli/, allow users to mine the protein structure space by establishing relationships between protein structures and their functions.},
}

% 592: functional profiling with text-mining derived bioentities (Bioinformatics, 2007).
@article{592,
  author   = {Minguez, Pablo and Al-Shahrour, F{\'a}tima and Montaner, David and
              Dopazo, Joaquin},
  title    = {Functional profiling of microarray experiments using text-mining derived bioentities},
  journal  = {Bioinformatics},
  volume   = {23},
  pages    = {3098--3099},
  year     = {2007},
  month    = nov,
  day      = {15},
  issn     = {1367-4811},
  doi      = {10.1093/bioinformatics/btm445},
  keywords = {Artificial Intelligence, Databases, Protein, Gene Expression Profiling, Information Storage and Retrieval, Natural Language Processing, Proteins, Research Design, Systems Integration},
  abstract = {MOTIVATION: The increasing use of microarray technologies brought about a parallel demand in methods for the functional interpretation of the results. Beyond the conventional functional annotations for genes, such as gene ontology, pathways, etc. other sources of information are still to be exploited. Text-mining methods allow extracting informative terms (bioentities) with different functional, chemical, clinical, etc. meanings, that can be associated to genes. We show how to use these associations within an appropriate statistical framework and how to apply them through easy-to-use, web-based environments to the functional interpretation of microarray experiments. Functional enrichment and gene set enrichment tests using bioentities are presented.},
}