zettelkasten

https://github.com/yrahul3910/zettelkasten
Science Score: 18.0%

This score indicates how likely this project is to be science-related based on various indicators:
✓
CITATION.cff file
Found CITATION.cff file
○
codemeta.json file
○
.zenodo.json file
○
DOI references
○
Academic links in README
○
Academic email domains
○
Institutional organization owner
○
JOSS paper metadata
○
Scientific vocabulary similarity
Unable to calculate vocabulary similarity
Last synced: 10 months ago · JSON representation ·
Repository

Basic Info

Host: GitHub
Owner: yrahul3910
License: mit
Language: TeX
Default Branch: master
Size: 17.1 MB
Statistics

Stars: 0
Watchers: 2
Forks: 0
Open Issues: 0
Releases: 0
Created over 2 years ago · Last pushed over 2 years ago
Metadata Files

License Citation
Owner

Name: Rahul Yedida
Login: yrahul3910
Kind: user
Location: Raleigh, NC, USA
Company: LexisNexis
Website: https://ryedida.me
Repositories: 155
Profile: https://github.com/yrahul3910
CS PhD at NC State University
Citation (citations.bib)

@online{2019arXiv190700481B,
  author     = {Bianchi, Filippo Maria and Grattarola, Daniele and Alippi, Cesare},
  title      = {Spectral {{Clustering}} with {{Graph Neural Networks}} for {{Graph Pooling}}},
  date       = {2019-06},
  eprint     = {1907.00481},
  eprinttype = {arxiv},
  pubstate   = {preprint},
  keywords   = {Computer Science - Machine Learning,Statistics - Machine Learning}
}

@unpublished{Aakur,
  author     = {Aakur, Sathyanarayanan N. and Sarkar, Sudeep},
  title      = {A {{Perceptual Prediction Framework}} for {{Self Supervised Event Segmentation}}},
  date       = {2018-11},
  eprint     = {1811.04869v2},
  eprinttype = {arxiv},
  file       = {/Users/ryedida/Zotero/storage/2K9JRZAJ/Aakur, Sarkar - Unknown - A Perceptual Prediction Framework for Self Supervised Event Segmentation(2).pdf}
}

@article{aanhetrotSafetyEfficacyRepeatedDose2010a,
  author       = {aan het Rot, Marije and Collins, Katherine A. and Murrough, James W. and Perez, Andrew M. and Reich, David L. and Charney, Dennis S. and Mathew, Sanjay J.},
  title        = {Safety and {{Efficacy}} of {{Repeated-Dose Intravenous Ketamine}} for {{Treatment-Resistant Depression}}},
  journaltitle = {Biological Psychiatry},
  shortjournal = {Biological Psychiatry},
  date         = {2010-01},
  volume       = {67},
  number       = {2},
  pages        = {139--145},
  issn         = {0006-3223},
  doi          = {10.1016/j.biopsych.2009.08.038},
  url          = {https://linkinghub.elsevier.com/retrieve/pii/S000632230901097X},
  urldate      = {2023-09-21},
  abstract     = {Background: A single subanesthetic (intravenous) IV dose of ketamine might have rapid but transient antidepressant effects in patients with treatment-resistant depression (TRD). Here we tested the tolerability, safety, and efficacy of repeated-dose open-label IV ketamine (six infusions over 12 days) in 10 medication-free symptomatic patients with TRD who had previously shown a meaningful antidepressant response to a single dose. Methods: On day 1, patients received a 40-min IV infusion of ketamine (.5 mg/kg) in an inpatient setting with continuous vital-sign monitoring. Psychotomimetic effects and adverse events were recorded repeatedly. The primary efficacy measure was change from baseline in the Montgomery-Åsberg Depression Rating Scale (MADRS) score. If patients showed a ≥50\% reduction in MADRS scores on day 2, they received five additional infusions on an outpatient basis (days 3, 5, 8, 10, and 12). Follow-up visits were conducted twice-weekly for ≥4 weeks or until relapse. Results: Ketamine elicited minimal positive psychotic symptoms. Three patients experienced significant but transient dissociative symptoms. Side effects during and after each ketamine infusion were generally mild. The response criterion was met by nine patients after the first infusion as well as after the sixth infusion. The mean (SD) reduction in MADRS scores after the sixth infusion was 85\% (12\%). Postketamine, eight of nine patients relapsed, on average, 19 days after the sixth infusion (range 6 days–45 days). One patient remained antidepressant-free with minimal depressive symptoms for >3 months. Conclusions: These pilot findings suggest feasibility of repeated-dose IV ketamine for the acute treatment of TRD.},
  langid       = {english},
  file         = {/Users/ryedida/Zotero/storage/QIH3F78Y/Aan Het Rot et al. - 2010 - Safety and Efficacy of Repeated-Dose Intravenous K.pdf}
}

@article{Abadi2016,
  author       = {Abadi, Daniel and Agrawal, Rakesh and Ailamaki, Anastasia and Balazinska, Magdalena and Bernstein, Philip A. and Carey, Michael J. and Chaudhuri, Surajit and Dean, Jeffrey and Doan, Anhai and Franklin, Michael J. and Gehrke, Johannes and Haas, Laura M. and Halevy, Alon Y. and Hellerstein, Joseph M. and Ioannidis, Yannis E. and Jagadish, H. V. and Kossmann, Donald and Madden, Samuel and Mehrotra, Sharad and Milo, Tova and Naughton, Jeffrey F. and Ramakrishnan, Raghu and Markl, Volker and Olston, Christopher and Ooi, Beng Chin and Ré, Christopher and Suciu, Dan and Stonebraker, Michael and Walter, Todd and Widom, Jennifer},
  title        = {The {{Beckman}} Report on Database Research},
  journaltitle = {Communications of the ACM},
  date         = {2016},
  volume       = {59},
  number       = {2},
  pages        = {92--99},
  issn         = {1557-7317},
  doi          = {10.1145/2845915},
  abstract     = {A group of database researchers meets periodically to discuss the state of the field and its key directions going forward. Past meetings were held in 1989, 1990, 1995, 1996, 1998, 2003, and 2008. Continuing this tradition, 28 database researchers and two invited speakers met in October 2013 at the Beckman Center on the University of California-Irvine campus for two days of discussions. The meeting attendees represented a broad cross-section of interests, affiliations, seniority, and geography. Attendance was capped at 30 so the meeting would be as interactive as possible. This article summarizes the conclusions from that meeting; an extended report and participant presentations are available at http://beckman.cs.wisc.edu.},
  file         = {/Users/ryedida/Zotero/storage/8GKNKZ6K/Abadi et al. - 2016 - The beckman report on database research(2).pdf}
}

@article{abramsModelBalancingCooperation2012,
  author       = {Abrams, Daniel M. and Panaggio, Mark J.},
  title        = {A Model Balancing Cooperation and Competition Can Explain Our Right-Handed World and the Dominance of Left-Handed Athletes},
  journaltitle = {Journal of The Royal Society Interface},
  publisher    = {{Royal Society}},
  date         = {2012-04-25},
  volume       = {9},
  number       = {75},
  pages        = {2718--2722},
  doi          = {10.1098/rsif.2012.0211},
  url          = {https://royalsocietypublishing.org/doi/full/10.1098/rsif.2012.0211},
  urldate      = {2023-12-09},
  keywords     = {evolution,handedness,laterality,mathematical model},
  abstract     = {An overwhelming majority of humans are right-handed. Numerous explanations for individual handedness have been proposed, but this population-level handedness remains puzzling. Here, we present a novel mathematical model and use it to test the idea that population-level hand preference represents a balance between selective costs and benefits arising from cooperation and competition in human evolutionary history. We use the selection of elite athletes as a test-bed for our evolutionary model and find evidence for the validity of this idea. Our model gives the first quantitative explanation for the distribution of handedness both across and within many professional sports. It also predicts strong lateralization of hand use in social species with limited combative interaction, and elucidates the absence of consistent population-level ‘pawedness’ in some animal species.},
  file         = {/Users/ryedida/Zotero/storage/2GPUL9KP/Abrams_Panaggio_2012_A model balancing cooperation and competition can explain our right-handed.pdf}
}

@online{adamMaintainingStabilityPlasticity2023,
  ids         = {adamMaintainingStabilityPlasticity2023a},
  author      = {Adam, George and Haibe-Kains, Benjamin and Goldenberg, Anna},
  title       = {Maintaining {{Stability}} and {{Plasticity}} for {{Predictive Churn Reduction}}},
  date        = {2023-05-06},
  eprint      = {2305.04135},
  eprinttype  = {arxiv},
  eprintclass = {cs.LG},
  url         = {http://arxiv.org/abs/2305.04135},
  urldate     = {2024-01-22},
  abstract    = {Deployed machine learning models should be updated to take advantage of a larger sample size to improve performance, as more data is gathered over time. Unfortunately, even when model updates improve aggregate metrics such as accuracy, they can lead to errors on samples that were correctly predicted by the previous model causing per-sample regression in performance known as predictive churn. Such prediction flips erode user trust thereby reducing the effectiveness of the human-AI team as a whole. We propose a solution called Accumulated Model Combination (AMC) based keeping the previous and current model version, and generating a meta-output using the prediction of the two models. AMC is a general technique and we propose several instances of it, each having their own advantages depending on the model and data properties. AMC requires minimal additional computation and changes to training procedures. We motivate the need for AMC by showing the difficulty of making a single model consistent with its own predictions throughout training thereby revealing an implicit stability-plasticity tradeoff when training a single model. We demonstrate the effectiveness of AMC on a variety of modalities including computer vision, text, and tabular datasets comparing against state-of-the-art churn reduction methods, and showing superior churn reduction ability compared to all existing methods while being more efficient than ensembles.},
  langid      = {english},
  pubstate    = {preprint},
  keywords    = {Computer Science - Machine Learning},
  file        = {/Users/ryedida/Zotero/storage/NLQX6EJQ/Adam et al. - 2023 - Maintaining Stability and Plasticity for Predictiv.pdf;/Users/ryedida/Zotero/storage/NIDKYIYL/Adam et al. - 2023 - Maintaining Stability and Plasticity for Predictiv.pdf}
}

@inproceedings{aggarwal2001surprising,
  author     = {Aggarwal, Charu C. and Hinneburg, Alexander and Keim, Daniel A.},
  title      = {On the Surprising Behavior of Distance Metrics in High Dimensional Space},
  booktitle  = {Database {{Theory}}—{{ICDT}} 2001},
  eventtitle = {8th {{International Conference}} on {{Database Theory}}},
  eventdate  = {2001-01-04/2001-01-06},
  venue      = {London, UK},
  date       = {2001},
  pages      = {420--434},
  publisher  = {{Springer}}
}

@article{AGRAWAL18ist,
  author       = {Agrawal, Amritanshu and Fu, Wei and Menzies, Tim},
  title        = {What Is Wrong with Topic Modeling? {{And}} How to Fix It Using Search-Based Software Engineering},
  journaltitle = {Information and Software Technology},
  date         = {2018},
  volume       = {98},
  pages        = {74--88},
  issn         = {0950-5849},
  doi          = {10.1016/j.infsof.2018.02.005},
  url          = {https://www.sciencedirect.com/science/article/pii/S0950584917300861},
  keywords     = {Differential evolution,LDA,Stability,Topic modeling,Tuning}
}

@article{agrawal2019dodge,
  author       = {Agrawal, Amritanshu and Fu, Wei and Chen, Di and Shen, Xipeng and Menzies, Tim},
  title        = {How to ``{{DODGE}}'' {{Complex Software Analytics}}},
  journaltitle = {IEEE Transactions on Software Engineering},
  date         = {2019},
  publisher    = {{IEEE}},
  file         = {/Users/ryedida/Zotero/storage/CTJ8MX77/Agrawal et al_2019_How to DODGE Complex Software Analytics.pdf}
}

@article{agrawal2020better,
  author       = {Agrawal, Amritanshu and Menzies, Tim and Minku, Leandro L and Wagner, Markus and Yu, Zhe},
  title        = {Better Software Analytics via “{{DUO}}”: {{Data}} Mining Algorithms Using/Used-by Optimizers},
  journaltitle = {Empirical Software Engineering},
  publisher    = {{Springer}},
  date         = {2020},
  volume       = {25},
  pages        = {2099--2136}
}

@article{agrawal2021simpler,
  author       = {Agrawal, Amritanshu and Yang, Xueqi and Agrawal, Rishabh and Yedida, Rahul and Shen, Xipeng and Menzies, Tim},
  title        = {Simpler Hyperparameter Optimization for Software Analytics: {{Why}}, How, When?},
  journaltitle = {IEEE Transactions on Software Engineering},
  publisher    = {{IEEE}},
  date         = {2021},
  volume       = {48},
  number       = {8},
  pages        = {2939--2954}
}

@inproceedings{agrawalBetterDataBetter2018,
  author     = {Agrawal, Amritanshu and Menzies, Tim},
  title      = {Is ``Better Data'' Better than ``Better Data Miners''?: On the Benefits of Tuning {{SMOTE}} for Defect Prediction},
  shorttitle = {Is ``Better Data'' Better than ``Better Data Miners''?},
  booktitle  = {Proceedings of the 40th {{International Conference}} on {{Software Engineering}}},
  eventtitle = {{{ICSE}} '18: 40th {{International Conference}} on {{Software Engineering}}},
  venue      = {Gothenburg, Sweden},
  date       = {2018-05-27},
  pages      = {1050--1061},
  publisher  = {{ACM}},
  doi        = {10.1145/3180155.3180197},
  url        = {https://dl.acm.org/doi/10.1145/3180155.3180197},
  urldate    = {2024-01-07},
  abstract   = {We report and fix an important systematic error in prior studies that ranked classifiers for software analytics. Those studies did not (a) assess classifiers on multiple criteria and they did not (b) study how variations in the data affect the results. Hence, this paper applies (a) multi-performance criteria while (b) fixing the weaker regions of the training data (using SMOTUNED, which is an autotuning version of SMOTE). This approach leads to dramatically large increases in software defect predictions when applied in a 5*5 cross-validation study for 3,681 JAVA classes (containing over a million lines of code) from open source systems, SMOTUNED increased AUC and recall by 60\% and 20\% respectively. These improvements are independent of the classifier used to predict for defects. Same kind of pattern (improvement) was observed when a comparative analysis of SMOTE and SMOTUNED was done against the most recent class imbalance technique.},
  isbn       = {978-1-4503-5638-1},
  langid     = {english},
  file       = {/Users/ryedida/Zotero/storage/T7F655HD/Agrawal and Menzies - 2018 - Is better data better than better data miners.pdf}
}

@article{aha91,
  author       = {Aha, David W. and Kibler, Dennis and Albert, Marc K.},
  title        = {Instance-Based Learning Algorithms},
  journaltitle = {Machine Learning},
  date         = {1991-01},
  volume       = {6},
  number       = {1},
  pages        = {37--66},
  doi          = {10.1023/A:1022689900470}
}

@inproceedings{alonCODE2SEQGENERATINGSEQUENCES2019,
  author    = {Alon, Uri and Levy, Omer and Brody, Shaked and Yahav, Eran},
  title     = {{{code2seq}}: Generating Sequences from Structured Representations of Code},
  booktitle = {International Conference on Learning Representations},
  date      = {2019},
  pagetotal = {22},
  abstract  = {The ability to generate natural language sequences from source code snippets has a variety of applications such as code summarization, documentation, and retrieval. Sequence-to-sequence (seq2seq) models, adopted from neural machine translation (NMT), have achieved state-of-the-art performance on these tasks by treating source code as a sequence of tokens. We present CODE2SEQ: an alternative approach that leverages the syntactic structure of programming languages to better encode source code. Our model represents a code snippet as the set of compositional paths in its abstract syntax tree (AST) and uses attention to select the relevant paths while decoding. We demonstrate the effectiveness of our approach for two tasks, two programming languages, and four datasets of up to 16M examples. Our model significantly outperforms previous models that were specifically designed for programming languages, as well as state-of-the-art NMT models. An online demo of our model is available at http://code2seq.org. Our code, data and trained models are available at http://github.com/tech-srl/code2seq.},
  langid    = {english},
  file      = {/Users/ryedida/Zotero/storage/U4HB2ACG/Alon et al. - 2019 - CODE2SEQ GENERATING SEQUENCES FROM STRUCTURED REP.pdf}
}

@thesis{ammarMultiheuristicTheoryAssessment,
  author      = {Ammar, Kareem},
  title       = {Multi-Heuristic Theory Assessment with Iterative Selection},
  type        = {M.S.E.E.},
  institution = {{West Virginia University}},
  location    = {{United States -- West Virginia}},
  pagetotal   = {114},
  isbn        = {9780496921218},
  url         = {https://www.proquest.com/docview/305108489/abstract/3D81689A37924677PQ/1},
  urldate     = {2023-12-11},
  langid      = {english},
  keywords    = {Applied sciences},
  abstract    = {Modern day machine learning is not without its shortcomings. To start with, the heuristic accuracy, which is the standard assessment criteria for machine learning, is not always the best heuristic to gauge the performance of machine learners. Also machine learners many times produce theories that are unintelligible by people and must be assessed as automated classifiers through machines. Theses theories are either too large or not properly formatted for human interpretation. Furthermore, our studies have identified that most of the data sets we have encountered are satiated with worthless data that actually leads to the degradation of the accuracy of machine learners. Therefore, simpler learning is more optimal. This necessitates a simpler classifier that is not confused with highly correlated data. Lastly, existing machine learners are not sensitive to domains. That is, they are not tunable to search for theories that are most beneficial to specific domains.},
  file        = {/Users/ryedida/Zotero/storage/9WAAPXSJ/Ammar_Multi-heuristic theory assessment with iterative selection.pdf}
}

@unpublished{andriushchenkoSquareAttackQueryefficient2020,
  author      = {Andriushchenko, Maksym and Croce, Francesco and Flammarion, Nicolas and Hein, Matthias},
  title       = {Square {{Attack}}: A Query-Efficient Black-Box Adversarial Attack via Random Search},
  shorttitle  = {Square {{Attack}}},
  date        = {2020-07-29},
  eprint      = {1912.00049},
  eprinttype  = {arxiv},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1912.00049},
  urldate     = {2021-03-27},
  abstract    = {We propose the Square Attack, a score-based black-box l2- and l∞-adversarial attack that does not rely on local gradient information and thus is not affected by gradient masking. Square Attack is based on a randomized search scheme which selects localized square-shaped updates at random positions so that at each iteration the perturbation is situated approximately at the boundary of the feasible set. Our method is significantly more query efficient and achieves a higher success rate compared to the state-of-the-art methods, especially in the untargeted setting. In particular, on ImageNet we improve the average query efficiency in the untargeted setting for various deep networks by a factor of at least 1.8 and up to 3 compared to the recent state-of-the-art l∞-attack of Al-Dujaili \& O'Reilly (2020). Moreover, although our attack is black-box, it can also outperform gradient-based white-box attacks on the standard benchmarks achieving a new state-of-the-art in terms of the success rate. The code of our attack is available at https://github.com/max-andr/square-attack.},
  langid      = {english},
  keywords    = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Cryptography and Security,Computer Science - Machine Learning,Statistics - Machine Learning},
  file        = {/Users/ryedida/Zotero/storage/P8QAX52B/Andriushchenko et al. - 2020 - Square Attack a query-efficient black-box adversa.pdf}
}

@inproceedings{Andujar2017,
  author    = {Andujar, Marvin and Gilbert, Juan E.},
  title     = {A User-Centered Approach towards Attention Visualization for Learning Activities},
  booktitle = {{UbiComp}/{ISWC} Adjunct},
  date      = {2017},
  pages     = {871--876},
  publisher = {{ACM}},
  doi       = {10.1145/3123024.3125505},
  isbn      = {978-1-4503-5190-4},
  abstract  = {Technology is changing the way that students learn. Many students learn the content of different subjects outside the classroom by watching videos on YouTube, Vimeo, Khan Academy, and other sources. Nevertheless, exams are currently the best feedback indication of how effective the student's study session was. Physiological sensors such as electroencephalography (EEG) are accurate enough for measuring attention levels for feedback during self-learning. The traditional representation of brain data is complex for everyday users to comprehend; therefore, such graphs need to be represented in the form of static visualizations for simplicity. In this paper, we describe visualizations formulated by college students in focus groups sessions to visualize attention data for feedback after completing a learning task. These visualizations help us understand user's preference and perception for representing attention, which can be implemented in future quantified-self applications and dashboards.},
  file      = {/Users/ryedida/Zotero/storage/NHPIWKPF/Andujar, Gilbert - 2017 - A user-centered approach towards attention visualization for learning activities(2).pdf}
}

@inproceedings{Andujar2018,
  author    = {Andujar, Marvin and Caprio, Derek},
  title     = {Effectiveness of the {{Alpha Calibration}} with a {{Brain-Computer Interface}} for {{College Students}}},
  booktitle = {Proceedings of the 2018 {{ACM}} International Joint Conference and 2018 International Symposium on Pervasive and Ubiquitous Computing and Wearable Computers, {{UbiComp}}/{{ISWC}} 2018 Adjunct},
  venue     = {Singapore},
  eventdate = {2018-10-08/2018-10-12},
  date      = {2018},
  pages     = {960--963},
  doi       = {10.1145/3267305.3267690},
  isbn      = {9781450359665},
  file      = {/Users/ryedida/Zotero/storage/CZSZNCFE/Andujar, Caprio - 2018 - Effectiveness of the Alpha Calibration with a Brain-Computer Interface for College Students(2).pdf}
}

@inproceedings{anselOpenTunerExtensibleFramework2014,
  ids        = {ansel:pact:2014},
  author     = {Ansel, Jason and Kamil, Shoaib and Veeramachaneni, Kalyan and Ragan-Kelley, Jonathan and Bosboom, Jeffrey and O'Reilly, Una-May and Amarasinghe, Saman},
  title      = {{{OpenTuner}}: An Extensible Framework for Program Autotuning},
  shorttitle = {{{OpenTuner}}},
  booktitle  = {Proceedings of the 23rd International Conference on {{Parallel}} Architectures and Compilation},
  eventtitle = {{{PACT}} '14: {{International Conference}} on {{Parallel Architectures}} and {{Compilation}}},
  venue      = {Edmonton, AB, Canada},
  date       = {2014-08-24},
  pages      = {303--316},
  publisher  = {{ACM}},
  doi        = {10.1145/2628071.2628092},
  url        = {https://dl.acm.org/doi/10.1145/2628071.2628092},
  urldate    = {2024-01-15},
  abstract   = {Program autotuning has been shown to achieve better or more portable performance in a number of domains. However, autotuners themselves are rarely portable between projects, for a number of reasons: using a domain-informed search space representation is critical to achieving good results; search spaces can be intractably large and require advanced machine learning techniques; and the landscape of search spaces can vary greatly between different problems, sometimes requiring domain specific search techniques to explore efficiently.},
  isbn       = {978-1-4503-2809-8},
  langid     = {english},
  keywords   = {OpenTuner},
  file       = {/Users/ryedida/Zotero/storage/5EJAEUFY/Ansel et al. - 2014 - OpenTuner an extensible framework for program aut.pdf}
}

@article{AntidepressantEfficacyKetamine,
  author       = {Murrough, James W. and others},
  title        = {Antidepressant {{Efficacy}} of {{Ketamine}} in {{Treatment-Resistant Major Depression}}: {{A Two-Site Randomized Controlled Trial}}},
  shorttitle   = {Antidepressant {{Efficacy}} of {{Ketamine}} in {{Treatment-Resistant Major Depression}}},
  journaltitle = {American Journal of Psychiatry},
  date         = {2013-10},
  volume       = {170},
  number       = {10},
  pages        = {1134--1142},
  doi          = {10.1176/appi.ajp.2013.13030392},
  url          = {https://ajp.psychiatryonline.org/doi/epdf/10.1176/appi.ajp.2013.13030392},
  urldate      = {2023-09-21},
  langid       = {english},
  file         = {/Users/ryedida/Zotero/storage/KENT729Q/Antidepressant Efficacy of Ketamine in Treatment-Resistant Major Depression.pdf;/Users/ryedida/Zotero/storage/5JNFLUGS/appi.ajp.2013.html}
}

@online{arangoHPOBLargeScaleReproducible2021,
  author      = {Pineda Arango, Sebastian and Jomaa, Hadi S. and Wistuba, Martin and Grabocka, Josif},
  title       = {{{HPO-B}}: {{A Large-Scale Reproducible Benchmark}} for {{Black-Box HPO}} Based on {{OpenML}}},
  shorttitle  = {{{HPO-B}}},
  date        = {2021-10-11},
  eprint      = {2106.06257},
  eprinttype  = {arxiv},
  eprintclass = {cs.LG},
  url         = {http://arxiv.org/abs/2106.06257},
  urldate     = {2023-12-12},
  abstract    = {Hyperparameter optimization (HPO) is a core problem for the machine learning community and remains largely unsolved due to the significant computational resources required to evaluate hyperparameter configurations. As a result, a series of recent related works have focused on the direction of transfer learning for quickly fine-tuning hyperparameters on a dataset. Unfortunately, the community does not have a common large-scale benchmark for comparing HPO algorithms. Instead, the de facto practice consists of empirical protocols on arbitrary small-scale meta-datasets that vary inconsistently across publications, making reproducibility a challenge. To resolve this major bottleneck and enable a fair and fast comparison of black-box HPO methods on a level playing field, we propose HPO-B, a new large-scale benchmark in the form of a collection of meta-datasets. Our benchmark is assembled and preprocessed from the OpenML repository and consists of 176 search spaces (algorithms) evaluated sparsely on 196 datasets with a total of 6.4 million hyperparameter evaluations. For ensuring reproducibility on our benchmark, we detail explicit experimental protocols, splits, and evaluation measures for comparing methods for both non-transfer, as well as, transfer learning HPO.},
  langid      = {english},
  pubstate    = {preprint},
  keywords    = {Computer Science - Machine Learning},
  file        = {/Users/ryedida/Zotero/storage/53XP7TVF/Arango et al. - 2021 - HPO-B A Large-Scale Reproducible Benchmark for Bl.pdf}
}

@inproceedings{arcuriPracticalGuideUsing2011,
  author     = {Arcuri, Andrea and Briand, Lionel},
  title      = {A Practical Guide for Using Statistical Tests to Assess Randomized Algorithms in Software Engineering},
  booktitle  = {Proceedings of the 33rd {{International Conference}} on {{Software Engineering}}},
  eventtitle = {{{ICSE11}}: {{International Conference}} on {{Software Engineering}}},
  venue      = {Waikiki, Honolulu, HI, USA},
  date       = {2011-05-21},
  pages      = {1--10},
  publisher  = {{ACM}},
  doi        = {10.1145/1985793.1985795},
  url        = {https://dl.acm.org/doi/10.1145/1985793.1985795},
  urldate    = {2023-12-09},
  abstract   = {Randomized algorithms have been used to successfully address many different types of software engineering problems. This type of algorithms employ a degree of randomness as part of their logic. Randomized algorithms are useful for difficult problems where a precise solution cannot be derived in a deterministic way within reasonable time. However, randomized algorithms produce different results on every run when applied to the same problem instance. It is hence important to assess the effectiveness of randomized algorithms by collecting data from a large enough number of runs. The use of rigorous statistical tests is then essential to provide support to the conclusions derived by analyzing such data. In this paper, we provide a systematic review of the use of randomized algorithms in selected software engineering venues in 2009. Its goal is not to perform a complete survey but to get a representative snapshot of current practice in software engineering research. We show that randomized algorithms are used in a significant percentage of papers but that, in most cases, randomness is not properly accounted for. This casts doubts on the validity of most empirical results assessing randomized algorithms. There are numerous statistical tests, based on different assumptions, and it is not always clear when and how to use these tests. We hence provide practical guidelines to support empirical research on randomized algorithms in software engineering.},
  isbn       = {978-1-4503-0445-0},
  langid     = {english},
  file       = {/Users/ryedida/Zotero/storage/AGZ79G22/Arcuri and Briand - 2011 - A practical guide for using statistical tests to a.pdf}
}

@unpublished{aroraExactComputationInfinitely2019,
  author      = {Arora, Sanjeev and Du, Simon S. and Hu, Wei and Li, Zhiyuan and Salakhutdinov, Ruslan and Wang, Ruosong},
  title       = {On {{Exact Computation}} with an {{Infinitely Wide Neural Net}}},
  date        = {2019-11-04},
  eprinttype  = {arxiv},
  eprint      = {1904.11955},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1904.11955},
  urldate     = {2022-01-01},
  langid      = {english},
  keywords    = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
  annotation  = {416 citations (Semantic Scholar/arXiv) [2022-01-01]},
  abstract    = {How well does a classic deep net architecture like AlexNet or VGG19 classify on a standard dataset such as CIFAR-10 when its “width”— namely, number of channels in convolutional layers, and number of nodes in fully-connected internal layers — is allowed to increase to infinity? Such questions have come to the forefront in the quest to theoretically understand deep learning and its mysteries about optimization and generalization. They also connect deep learning to notions such as Gaussian processes and kernels. A recent paper [Jacot et al., 2018] introduced the Neural Tangent Kernel (NTK) which captures the behavior of fully-connected deep nets in the infinite width limit trained by gradient descent; this object was implicit in some other recent papers. An attraction of such ideas is that a pure kernel-based method is used to capture the power of a fully-trained deep net of infinite width.},
  file        = {/Users/ryedida/Zotero/storage/YAHIA9B4/Arora et al. - 2019 - On Exact Computation with an Infinitely Wide Neura.pdf}
}

@article{arunGeneticAlgorithmbasedOversampling2021,
  author       = {Arun, C. and Lakshmi, C.},
  title        = {Genetic Algorithm-Based Oversampling Approach to Prune the Class Imbalance Issue in Software Defect Prediction},
  date         = {2021-08-29},
  journaltitle = {Soft Computing},
  shortjournal = {Soft Comput},
  issn         = {1432-7643, 1433-7479},
  doi          = {10.1007/s00500-021-06112-6},
  url          = {https://link.springer.com/10.1007/s00500-021-06112-6},
  urldate      = {2022-01-07},
  langid       = {english},
  annotation   = {0 citations (Semantic Scholar/DOI) [2022-01-07]},
  abstract     = {Class imbalance is the potential problem that has been existent in machine learning, which hinders the performance of the classification algorithm when applied in real-world applications such as electricity pilferage, fraudulent transactions, anomaly detection, and prediction of rare diseases. Class imbalance refers to the problem where the distribution of the sample is skewed or biased toward one particular class. Due to its intrinsic nature the software fault prediction dataset falls into the same category where the software modules contain fewer defective modules compared to the non-defective modules. The majority of the oversampling techniques that has been proposed is to address the issue by generating synthetic samples of minority class to balance the dataset. But the synthetic samples generated are near duplicates that also results in over-generalization issue. We thus propose a novel oversampling approach to introduce synthetic samples using genetic algorithm (GA). GA is a form of evolutionary algorithm that employs biologically inspired techniques such as inheritance, mutation, selection, and crossover. The proposed algorithm generates synthetic sample of minority class based on the distribution measure and ensures that the samples are diverse within the class and are efficient. The proposed oversampling algorithm has been compared with SMOTE, BSMOTE, ADASYN, random oversampling, MAHAKIL, and no sampling approach with 20 defect prediction datasets from the promise repository and five prediction models. The results indicate that the genetic algorithm oversampling approach improves the fault prediction performance and reduced false alarm rate.},
  file         = {/Users/ryedida/Zotero/storage/MDYBQHV9/Arun and Lakshmi - 2021 - Genetic algorithm-based oversampling approach to p.pdf}
}

@inproceedings{Athalye2018,
  title = {Obfuscated Gradients Give a False Sense of Security: {{Circumventing}} Defenses to Adversarial Examples},
  author = {Athalye, Anish and Carlini, Nicholas and Wagner, David},
  date = {2018},
  booktitle = {35th International Conference on Machine Learning, ICML 2018},
  volume = {1},
  eprint = {1802.00420},
  eprinttype = {arxiv},
  pages = {436--448},
  abstract = {We identify obfuscated gradients, a kind of gradient masking, as a phenomenon that leads to a false sense of security in defenses against adversarial examples. While defenses that cause obfuscated gradients appear to defeat iterative optimization-based attacks, we find defenses relying on this effect can be circumvented. We describe characteristic behaviors of defenses exhibiting the effect, and for each of the three types of obfuscated gradients we discover, we develop attack techniques to overcome it. In a case study, examining non-certified white-box-secure defenses at ICLR 2018, we find obfuscated gradients are a common occurrence, with 7 of 9 defenses relying on obfuscated gradients. Our new attacks successfully circumvent 6 completely, and 1 partially, in the original threat model each paper considers.},
  isbn = {9781510867963},
  file = {/Users/ryedida/Zotero/storage/I4FYI8N9/Athalye, Carlini, Wagner - 2018 - Obfuscated gradients give a false sense of security Circumventing defenses to adversarial examples(2).pdf}
}

@unpublished{Athalye2018a,
  author     = {Athalye, Anish and Carlini, Nicholas},
  title      = {On the {{Robustness}} of the {{CVPR}} 2018 {{White-Box Adversarial Example Defenses}}},
  date       = {2018},
  pages      = {2--3},
  eprint     = {1804.03286},
  eprinttype = {arxiv},
  url        = {http://arxiv.org/abs/1804.03286},
  abstract   = {Neural networks are known to be vulnerable to adversarial examples. In this note, we evaluate the two white-box defenses that appeared at CVPR 2018 and find they are ineffective: when applying existing techniques, we can reduce the accuracy of the defended models to 0\%.},
  file       = {/Users/ryedida/Zotero/storage/Z3FCV653/Athalye, Carlini - 2018 - On the Robustness of the CVPR 2018 White-Box Adversarial Example Defenses(2).pdf}
}

@article{attenbergBeatMachineChallenging2015,
  title = {Beat the {{Machine}}: {{Challenging Humans}} to {{Find}} a {{Predictive Model}}'s “{{Unknown Unknowns}}”},
  shorttitle = {Beat the {{Machine}}},
  author = {Attenberg, Joshua and Ipeirotis, Panos and Provost, Foster},
  date = {2015-03-04},
  journaltitle = {Journal of Data and Information Quality},
  shortjournal = {J. Data and Information Quality},
  volume = {6},
  number = {1},
  pages = {1--17},
  issn = {1936-1955, 1936-1963},
  doi = {10.1145/2700832},
  url = {https://dl.acm.org/doi/10.1145/2700832},
  urldate = {2023-12-09},
  abstract = {We present techniques for gathering data that expose errors of automatic predictive models. In certain common settings, traditional methods for evaluating predictive models tend to miss rare but important errors—most importantly, cases for which the model is confident of its prediction (but wrong). In this article, we present a system that, in a game-like setting, asks humans to identify cases that will cause the predictive model-based system to fail. Such techniques are valuable in discovering problematic cases that may not reveal themselves during the normal operation of the system and may include cases that are rare but catastrophic. We describe the design of the system, including design iterations that did not quite work. In particular, the system incentivizes humans to provide examples that are difficult for the model to handle by providing a reward proportional to the magnitude of the predictive model's error. The humans are asked to “Beat the Machine” and find cases where the automatic model (“the Machine”) is wrong. Experiments show that the humans using Beat the Machine identify more errors than do traditional techniques for discovering errors in predictive models, and, indeed, they identify many more errors where the machine is (wrongly) confident it is correct. Furthermore, those cases the humans identify seem to be not simply outliers, but coherent areas missed completely by the model. Beat the Machine identifies the “unknown unknowns.” Beat the Machine has been deployed at an industrial scale by several companies. The main impact has been that firms are changing their perspective on and practice of evaluating predictive models. “There are known knowns. These are things we know that we know. There are known unknowns. That is to say, there are things that we know we don't know. But there are also unknown unknowns. There are things we don't know we don't know.” -- Donald Rumsfeld},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/YU657C64/Attenberg et al. - 2015 - Beat the Machine Challenging Humans to Find a Pre.pdf}
}

@article{aungMultitriageMultitaskLearning2022,
  author       = {Aung, Thazin Win Win and Wan, Yao and Huo, Huan and Sui, Yulei},
  title        = {Multi-Triage: {{A}} Multi-Task Learning Framework for Bug Triage},
  shorttitle   = {Multi-Triage},
  date         = {2022-02-01},
  journaltitle = {Journal of Systems and Software},
  shortjournal = {Journal of Systems and Software},
  volume       = {184},
  pages        = {111133},
  issn         = {0164-1212},
  doi          = {10.1016/j.jss.2021.111133},
  url          = {https://www.sciencedirect.com/science/article/pii/S0164121221002302},
  urldate      = {2022-01-19},
  langid       = {english},
  keywords     = {Bug triage,Deep learning,Multi-task learning,Recommendation system},
  annotation   = {0 citations (Semantic Scholar/DOI) [2022-01-19]},
  abstract     = {Assigning developers and allocating issue types are two important tasks in the bug triage process. Existing approaches tackle these two tasks separately, which is time-consuming due to repetition of effort and negating the values of correlated information between tasks. In this paper, a multi-triage model is proposed that resolves both tasks simultaneously via multi-task learning (MTL). First, both tasks can be regarded as a classification problem, based on historical issue reports. Second, performances on both tasks can be improved by jointly interpreting the representations of the issue report information. To do so, a text encoder and abstract syntax tree (AST) encoder are used to extract the feature representation of bug descriptions and code snippets accordingly. Finally, due to the disproportionate ratio of class labels in training datasets, the contextual data augmentation approach is introduced to generate syntactic issue reports to balance the class labels. Experiments were conducted on eleven open-source projects to demonstrate the effectiveness of this model compared with state-of-the-art methods.},
  file         = {/Users/ryedida/Zotero/storage/DHSTX3PB/S0164121221002302.html}
}

@unpublished{Avati2018,
  author     = {Avati, Anand and Duan, Tony and Jung, Kenneth and Shah, Nigam H. and Ng, Andrew},
  title      = {Countdown {{Regression}}: {{Sharp}} and {{Calibrated Survival Predictions}}},
  date       = {2018},
  pages      = {1--14},
  eprint     = {1806.08324},
  eprinttype = {arxiv},
  url        = {http://arxiv.org/abs/1806.08324},
  abstract   = {Personalized probabilistic forecasts of time to event (such as mortality) can be crucial in decision making, especially in the clinical setting. Inspired by ideas from the meteorology literature, we approach this problem through the paradigm of maximizing sharpness of prediction distributions, subject to calibration. In regression problems, it has been shown that optimizing the continuous ranked probability score (CRPS) instead of maximum likelihood leads to sharper prediction distributions while maintaining calibration. We introduce the Survival-CRPS, a generalization of the CRPS to the time to event setting, and present right-censored and interval-censored variants. To holistically evaluate the quality of predicted distributions over time to event, we present the Survival-AUPRC evaluation metric, an analog to area under the precision-recall curve. We apply these ideas by building a recurrent neural network for mortality prediction, using an Electronic Health Record dataset covering millions of patients. We demonstrate significant benefits in models trained by the Survival-CRPS objective instead of maximum likelihood.},
  file       = {/Users/ryedida/Zotero/storage/VUXJX2H8/Avati et al. - 2018 - Countdown Regression Sharp and Calibrated Survival Predictions(2).pdf}
}

@unpublished{awadDEHBEvolutionaryHyperband2021,
  author      = {Awad, Noor and Mallik, Neeratyoy and Hutter, Frank},
  title       = {{{DEHB}}: {{Evolutionary Hyperband}} for {{Scalable}}, {{Robust}} and {{Efficient Hyperparameter Optimization}}},
  shorttitle  = {{{DEHB}}},
  date        = {2021-10-21},
  eprint      = {2105.09821},
  eprinttype  = {arxiv},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2105.09821},
  urldate     = {2022-01-02},
  langid      = {english},
  keywords    = {Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing},
  annotation  = {5 citations (Semantic Scholar/arXiv) [2022-01-02]},
  abstract    = {Modern machine learning algorithms crucially rely on several design decisions to achieve strong performance, making the problem of Hyperparameter Optimization (HPO) more important than ever. Here, we combine the advantages of the popular bandit-based HPO method Hyperband (HB) and the evolutionary search approach of Differential Evolution (DE) to yield a new HPO method which we call DEHB. Comprehensive results on a very broad range of HPO problems, as well as a wide range of tabular benchmarks from neural architecture search, demonstrate that DEHB achieves strong performance far more robustly than all previous HPO methods we are aware of, especially for high-dimensional problems with discrete input dimensions. For example, DEHB is up to 1000× faster than random search. It is also efficient in computational time, conceptually simple and easy to implement, positioning it well to become a new default HPO method.},
  file        = {/Users/ryedida/Zotero/storage/5SLK5TRZ/Awad et al. - 2021 - DEHB Evolutionary Hyperband for Scalable, Robust .pdf}
}

@inproceedings{B2017,
  title = {The {{Impact}} of {{Toxic Language}} on the {{Health}} of {{Reddit Communities}}},
  author = {Mohan, Shruthi and Guha, Apala and Harris, Michael and Popowich, Fred and Schuster, Ashley and Priebe, Chris},
  date = {2017},
  booktitle = {Advances in {{Artificial Intelligence}}},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {10233},
  pages = {51--56},
  publisher = {{Springer}},
  doi = {10.1007/978-3-319-57351-9_6}
}

@unpublished{Ba2016,
  author     = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},
  title      = {Layer {{Normalization}}},
  date       = {2016},
  eprint     = {1607.06450},
  eprinttype = {arxiv},
  url        = {http://arxiv.org/abs/1607.06450},
  abstract   = {Training state-of-the-art, deep neural networks is computationally expensive. One way to reduce the training time is to normalize the activities of the neurons. A recently introduced technique called batch normalization uses the distribution of the summed input to a neuron over a mini-batch of training cases to compute a mean and variance which are then used to normalize the summed input to that neuron on each training case. This significantly reduces the training time in feed-forward neural networks. However, the effect of batch normalization is dependent on the mini-batch size and it is not obvious how to apply it to recurrent neural networks. In this paper, we transpose batch normalization into layer normalization by computing the mean and variance used for normalization from all of the summed inputs to the neurons in a layer on a single training case. Like batch normalization, we also give each neuron its own adaptive bias and gain which are applied after the normalization but before the non-linearity. Unlike batch normalization, layer normalization performs exactly the same computation at training and test times. It is also straightforward to apply to recurrent neural networks by computing the normalization statistics separately at each time step. Layer normalization is very effective at stabilizing the hidden state dynamics in recurrent networks. Empirically, we show that layer normalization can substantially reduce the training time compared with previously published techniques.},
  file       = {/Users/ryedida/Zotero/storage/E99QHPUH/Ba, Kiros, Hinton - 2016 - Layer Normalization(2).pdf}
}

@online{baekAccurateLearningGraph2021a,
  author      = {Baek, Jinheon and Kang, Minki and Hwang, Sung Ju},
  title       = {Accurate {{Learning}} of {{Graph Representations}} with {{Graph Multiset Pooling}}},
  date        = {2021-06-28},
  eprint      = {2102.11533},
  eprinttype  = {arxiv},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2102.11533},
  urldate     = {2023-12-09},
  langid      = {english},
  pubstate    = {preprint},
  keywords    = {Computer Science - Machine Learning},
  abstract    = {Graph neural networks have been widely used on modeling graph data, achieving impressive results on node classification and link prediction tasks. Yet, obtaining an accurate representation for a graph further requires a pooling function that maps a set of node representations into a compact form. A simple sum or average over all node representations considers all node features equally without consideration of their task relevance, and any structural dependencies among them. Recently proposed hierarchical graph pooling methods, on the other hand, may yield the same representation for two different graphs that are distinguished by the Weisfeiler-Lehman test, as they suboptimally preserve information from the node features. To tackle these limitations of existing graph pooling methods, we first formulate the graph pooling problem as a multiset encoding problem with auxiliary information about the graph structure, and propose a Graph Multiset Transformer (GMT) which is a multi-head attention based global pooling layer that captures the interaction between nodes according to their structural dependencies. We show that GMT satisfies both injectiveness and permutation invariance, such that it is at most as powerful as the Weisfeiler-Lehman graph isomorphism test. Moreover, our methods can be easily extended to the previous node clustering approaches for hierarchical graph pooling. Our experimental results show that GMT significantly outperforms state-of-the-art graph pooling methods on graph classification benchmarks with high memory and time efficiency, and obtains even larger performance gain on graph reconstruction and generation tasks.},
  file        = {/Users/ryedida/Zotero/storage/4DCCWUU3/Baek et al. - 2021 - Accurate Learning of Graph Representations with Gr.pdf}
}

@article{bafnaThwartingAdversarialExamples,
  title = {Thwarting {{Adversarial Examples}}: {{An}} {{$L_0$}}-{{Robust Sparse Fourier Transform}}},
  author = {Bafna, Mitali and Murtagh, Jack and Vyas, Nikhil},
  date = {2018},
  journaltitle = {Advances in Neural Information Processing Systems},
  volume = {31},
  pagetotal = {11},
  abstract = {We give a new algorithm for approximating the Discrete Fourier transform of an approximately sparse signal that has been corrupted by worst-case L0 noise, namely a bounded number of coordinates of the signal have been corrupted arbitrarily. Our techniques generalize to a wide range of linear transformations that are used in data analysis such as the Discrete Cosine and Sine transforms, the Hadamard transform, and their high-dimensional analogs. We use our algorithm to successfully defend against well known L0 adversaries in the setting of image classification. We give experimental results on the Jacobian-based Saliency Map Attack (JSMA) and the Carlini Wagner (CW) L0 attack on the MNIST and Fashion-MNIST datasets as well as the Adversarial Patch on the ImageNet dataset.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/STMLFYVT/Bafna et al. - Thwarting Adversarial Examples An $L_0$-Robust Sp.pdf}
}

@unpublished{Bahdanau2014,
  author     = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title      = {Neural {{Machine Translation}} by {{Jointly Learning}} to {{Align}} and {{Translate}}},
  date       = {2014},
  pages      = {1--15},
  eprint     = {1409.0473},
  eprinttype = {arxiv},
  url        = {http://arxiv.org/abs/1409.0473},
  abstract   = {Neural machine translation is a recently proposed approach to machine translation. Unlike the traditional statistical machine translation, the neural machine translation aims at building a single neural network that can be jointly tuned to maximize the translation performance. The models proposed recently for neural machine translation often belong to a family of encoder-decoders and consists of an encoder that encodes a source sentence into a fixed-length vector from which a decoder generates a translation. In this paper, we conjecture that the use of a fixed-length vector is a bottleneck in improving the performance of this basic encoder-decoder architecture, and propose to extend this by allowing a model to automatically (soft-)search for parts of a source sentence that are relevant to predicting a target word, without having to form these parts as a hard segment explicitly. With this new approach, we achieve a translation performance comparable to the existing state-of-the-art phrase-based system on the task of English-to-French translation. Furthermore, qualitative analysis reveals that the (soft-)alignments found by the model agree well with our intuition.},
  file       = {/Users/ryedida/Zotero/storage/IX3PGV5K/Bahdanau, Cho, Bengio - 2014 - Neural Machine Translation by Jointly Learning to Align and Translate(2).pdf}
}

@online{baiConstitutionalAIHarmlessness,
  title = {Constitutional {{AI}}: {{Harmlessness}} from {{AI Feedback}}},
  author = {Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and Chen, Carol and Olsson, Catherine and Olah, Christopher and Hernandez, Danny and Drain, Dawn and Ganguli, Deep and Li, Dustin and Tran-Johnson, Eli and Perez, Ethan and Kerr, Jamie and Mueller, Jared and Ladish, Jeffrey and Landau, Joshua and Ndousse, Kamal and Lukosuite, Kamile and Lovitt, Liane and Sellitto, Michael and Elhage, Nelson and Schiefer, Nicholas and Mercado, Noemi and DasSarma, Nova and Lasenby, Robert and Larson, Robin and Ringer, Sam and Johnston, Scott and Kravec, Shauna and El Showk, Sheer and Fort, Stanislav and Lanham, Tamera and Telleen-Lawton, Timothy and Conerly, Tom and Henighan, Tom and Hume, Tristan and Bowman, Samuel R. and Hatfield-Dodds, Zac and Mann, Ben and Amodei, Dario and Joseph, Nicholas and McCandlish, Sam and Brown, Tom and Kaplan, Jared},
  date = {2022-12-15},
  eprint = {2212.08073},
  eprinttype = {arxiv},
  eprintclass = {cs.CL},
  url = {https://arxiv.org/abs/2212.08073},
  abstract = {As AI systems become more capable, we would like to enlist their help to supervise other AIs. We experiment with methods for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs. The only human oversight is provided through a list of rules or principles, and so we refer to the method as ‘Constitutional AI’. The process involves both a supervised learning and a reinforcement learning phase. In the supervised phase we sample from an initial model, then generate self-critiques and revisions, and then finetune the original model on revised responses. In the RL phase, we sample from the finetuned model, use a model to evaluate which of the two samples is better, and then train a preference model from this dataset of AI preferences. We then train with RL using the preference model as the reward signal, i.e. we use ‘RL from AI Feedback’ (RLAIF). As a result we are able to train a harmless but non-evasive AI assistant that engages with harmful queries by explaining its objections to them. Both the SL and RL methods can leverage chain-of-thought style reasoning to improve the human-judged performance and transparency of AI decision making. These methods make it possible to control AI behavior more precisely and with far fewer human labels.},
  langid = {english},
  pubstate = {preprint},
  file = {/Users/ryedida/Zotero/storage/Y6KNVGPF/Bai et al. - Constitutional AI Harmlessness from AI Feedback.pdf}
}

@unpublished{baker2016designing,
  author     = {Baker, Bowen and Gupta, Otkrist and Naik, Nikhil and Raskar, Ramesh},
  title      = {Designing Neural Network Architectures Using Reinforcement Learning},
  date       = {2016},
  eprint     = {1611.02167},
  eprinttype = {arxiv}
}

@article{baldassiSubdominantDenseClusters2015,
  author       = {Baldassi, Carlo and Ingrosso, Alessandro and Lucibello, Carlo and Saglietti, Luca and Zecchina, Riccardo},
  title        = {Subdominant {{Dense Clusters Allow}} for {{Simple Learning}} and {{High Computational Performance}} in {{Neural Networks}} with {{Discrete Synapses}}},
  date         = {2015-09-18},
  journaltitle = {Physical Review Letters},
  shortjournal = {Phys. Rev. Lett.},
  volume       = {115},
  number       = {12},
  pages        = {128101},
  issn         = {0031-9007, 1079-7114},
  doi          = {10.1103/PhysRevLett.115.128101},
  url          = {https://link.aps.org/doi/10.1103/PhysRevLett.115.128101},
  urldate      = {2023-11-25},
  langid       = {english},
  file         = {/Users/ryedida/Zotero/storage/QHY4LLRA/Baldassi et al. - 2015 - Subdominant Dense Clusters Allow for Simple Learni.pdf}
}

@article{baldassiUnreasonableEffectivenessLearning2016,
  title = {Unreasonable Effectiveness of Learning Neural Networks: {{From}} Accessible States and Robust Ensembles to Basic Algorithmic Schemes},
  shorttitle = {Unreasonable Effectiveness of Learning Neural Networks},
  author = {Baldassi, Carlo and Borgs, Christian and Chayes, Jennifer T. and Ingrosso, Alessandro and Lucibello, Carlo and Saglietti, Luca and Zecchina, Riccardo},
  date = {2016-11-29},
  journaltitle = {Proceedings of the National Academy of Sciences},
  volume = {113},
  number = {48},
  pages = {E7655--E7662},
  publisher = {{Proceedings of the National Academy of Sciences}},
  doi = {10.1073/pnas.1608103113},
  url = {https://www.pnas.org/doi/full/10.1073/pnas.1608103113},
  urldate = {2023-11-25},
  abstract = {In artificial neural networks, learning from data is a computationally demanding task in which a large number of connection weights are iteratively tuned through stochastic-gradient-based heuristic processes over a cost function. It is not well understood how learning occurs in these systems, in particular how they avoid getting trapped in configurations with poor computational performance. Here, we study the difficult case of networks with discrete weights, where the optimization landscape is very rough even for simple architectures, and provide theoretical and numerical evidence of the existence of rare—but extremely dense and accessible—regions of configurations in the network weight space. We define a measure, the robust ensemble (RE), which suppresses trapping by isolated configurations and amplifies the role of these dense regions. We analytically compute the RE in some exactly solvable models and also provide a general algorithmic scheme that is straightforward to implement: define a cost function given by a sum of a finite number of replicas of the original cost function, with a constraint centering the replicas around a driving assignment. To illustrate this, we derive several powerful algorithms, ranging from Markov Chains to message passing to gradient descent processes, where the algorithms target the robust dense states, resulting in substantial improvements in performance. The weak dependence on the number of precision bits of the weights leads us to conjecture that very similar reasoning applies to more conventional neural networks. Analogous algorithmic schemes can also be applied to other optimization problems.},
  file = {/Users/ryedida/Zotero/storage/RWBR7SBH/Baldassi et al_2016_Unreasonable effectiveness of learning neural networks.pdf}
}

@article{bansalFunctionCallGraph2023,
  title = {Function {{Call Graph Context Encoding}} for {{Neural Source Code Summarization}}},
  author = {Bansal, Aakash and Eberhart, Zachary and Karas, Zachary and Huang, Yu and McMillan, Collin},
  date = {2023-09},
  journaltitle = {IEEE Transactions on Software Engineering},
  shortjournal = {IEEE Trans. Software Eng.},
  volume = {49},
  number = {9},
  pages = {4268--4281},
  issn = {0098-5589, 1939-3520, 2326-3881},
  doi = {10.1109/TSE.2023.3279774},
  url = {https://ieeexplore.ieee.org/document/10132550/},
  urldate = {2023-10-06},
  abstract = {Source code summarization is the task of writing natural language descriptions of source code. The primary use of these descriptions is in documentation for programmers. Automatic generation of these descriptions is a high value research target due to the time cost to programmers of writing these descriptions themselves. In recent years, a confluence of software engineering and artificial intelligence research has made inroads into automatic source code summarization through applications of neural models of that source code. However, an Achilles’ heel to a vast majority of approaches is that they tend to rely solely on the context provided by the source code being summarized. But empirical studies in program comprehension are quite clear that the information needed to describe code much more often resides in the context in the form of Function Call Graph surrounding that code. In this paper, we present a technique for encoding this call graph context for neural models of code summarization. We implement our approach as a supplement to existing approaches, and show statistically significant improvement over existing approaches. In a human study with 20 programmers, we show that programmers perceive generated summaries to generally be as accurate, readable, and concise as human-written summaries.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/3NZWLML7/Bansal et al. - 2023 - Function Call Graph Context Encoding for Neural So.pdf}
}

@article{bartlettNearlytightVCdimensionPseudodimension2019,
  title = {Nearly-Tight {{VC-dimension}} and Pseudodimension Bounds for Piecewise Linear Neural Networks},
  author = {Bartlett, Peter L. and Harvey, Nick and Liaw, Christopher and Mehrabian, Abbas},
  date = {2019},
  journaltitle = {The Journal of Machine Learning Research},
  volume = {20},
  number = {1},
  pages = {2285--2301},
  publisher = {{JMLR.org}},
  issn = {1532-4435},
  file = {/Users/ryedida/Zotero/storage/UB29UM8C/Bartlett et al_2019_Nearly-tight VC-dimension and pseudodimension bounds for piecewise linear.pdf}
}

@article{Bastani2016,
  title = {Measuring Neural Net Robustness with Constraints},
  author = {Bastani, Osbert and Ioannou, Yani and Lampropoulos, Leonidas and Vytiniotis, Dimitrios and Nori, Aditya V. and Criminisi, Antonio},
  date = {2016},
  journaltitle = {Advances in Neural Information Processing Systems},
  eprint = {1605.07262},
  eprinttype = {arxiv},
  pages = {2621--2629},
  issn = {1049-5258},
  abstract = {Despite having high accuracy, neural nets have been shown to be susceptible to adversarial examples, where a small perturbation to an input can cause it to become mislabeled. We propose metrics for measuring the robustness of a neural net and devise a novel algorithm for approximating these metrics based on an encoding of robustness as a linear program. We show how our metrics can be used to evaluate the robustness of deep neural nets with experiments on the MNIST and CIFAR-10 datasets. Our algorithm generates more informative estimates of robustness metrics compared to estimates based on existing algorithms. Furthermore, we show how existing approaches to improving robustness “overfit” to adversarial examples generated using a specific algorithm. Finally, we show that our techniques can be used to additionally improve neural net robustness both according to the metrics that we propose, but also according to previously proposed metrics.},
  file = {/Users/ryedida/Zotero/storage/6RF9Q3A9/Bastani et al. - 2016 - Measuring neural net robustness with constraints(2).pdf}
}

@article{Bastani2018,
  title = {Verifiable Reinforcement Learning via Policy Extraction},
  author = {Bastani, Osbert and Pu, Yewen and Solar-Lezama, Armando},
  date = {2018},
  journaltitle = {Advances in Neural Information Processing Systems},
  volume = {31},
  eprint = {1805.08328},
  eprinttype = {arxiv},
  pages = {2494--2504},
  issn = {1049-5258},
  abstract = {While deep reinforcement learning has successfully solved many challenging control tasks, its real-world applicability has been limited by the inability to ensure the safety of learned policies. We propose an approach to verifiable reinforcement learning by training decision tree policies, which can represent complex policies (since they are nonparametric), yet can be efficiently verified using existing techniques (since they are highly structured). The challenge is that decision tree policies are difficult to train. We propose VIPER, an algorithm that combines ideas from model compression and imitation learning to learn decision tree policies guided by a DNN policy (called the oracle) and its Q-function, and show that it substantially outperforms two baselines. We use VIPER to (i) learn a provably robust decision tree policy for a variant of Atari Pong with a symbolic state space, (ii) learn a decision tree policy for a toy game based on Pong that provably never loses, and (iii) learn a provably stable decision tree policy for cart-pole. In each case, the decision tree policy achieves performance equal to that of the original DNN policy.},
  file = {/Users/ryedida/Zotero/storage/HJ7Y4WBY/Bastani, Pu, Solar-Lezama - 2018 - Verifiable reinforcement learning via policy extraction(2).pdf}
}

% Bentley's 1975 CACM article introducing the k-d tree.
@article{bentley75,
  title = {Multidimensional Binary Search Trees Used for Associative Searching},
  author = {Bentley, Jon Louis},
  date = {1975-09},
  journaltitle = {Communications of the ACM},
  shortjournal = {Commun. ACM},
  volume = {18},
  number = {9},
  pages = {509--517},
  publisher = {{Association for Computing Machinery}},
  location = {{New York, NY, USA}},
  issn = {0001-0782},
  doi = {10.1145/361002.361007},
  url = {https://doi.org/10.1145/361002.361007},
  abstract = {This paper develops the multidimensional binary search tree (or k-d tree, where k is the dimensionality of the search space) as a data structure for storage of information to be retrieved by associative searches. The k-d tree is defined and examples are given. It is shown to be quite efficient in its storage requirements. A significant advantage of this structure is that a single data structure can handle many types of queries very efficiently. Various utility algorithms are developed; their proven average running times in an n record file are: insertion, O(log n); deletion of the root, O(n(k-1)/k); deletion of a random node, O(log n); and optimization (guarantees logarithmic performance of searches), O(n log n). Search algorithms are given for partial match queries with t keys specified [proven maximum running time of O(n(k-t)/k)] and for nearest neighbor queries [empirically observed average running time of O(log n).] These performances far surpass the best currently known algorithms for these tasks. An algorithm is presented to handle any general intersection query. The main focus of this paper is theoretical. It is felt, however, that k-d trees could be quite useful in many applications, and examples of potential uses are given.},
  issue_date = {Sept. 1975},
  pagetotal = {9},
  keywords = {associative retrieval,attribute,binary search trees,binary tree insertion,information retrieval system,intersection queries,key,nearest neighbor queries,partial match queries}
}

% JMLR article; the February issue designation is carried by the date field.
@article{bergstra2012random,
  title = {Random Search for Hyper-Parameter Optimization},
  author = {Bergstra, James and Bengio, Yoshua},
  date = {2012-02},
  journaltitle = {Journal of Machine Learning Research},
  volume = {13},
  pages = {281--305}
}

% SciPy 2013 proceedings paper describing the Hyperopt library (pages 13--20 of the proceedings).
@inproceedings{bergstra2013hyperopt,
  title = {Hyperopt: {{A}} {{Python}} Library for Optimizing the Hyperparameters of Machine Learning Algorithms},
  booktitle = {Proceedings of the 12th {{Python}} in Science Conference},
  author = {Bergstra, James and Yamins, Dan and Cox, David D.},
  date = {2013},
  pages = {13--20},
  doi = {10.25080/Majora-8b375195-003}
}

% ICML 2013 conference paper, published in Proceedings of Machine Learning Research volume 28.
@inproceedings{bergstra2013making,
  title = {Making a Science of Model Search: {{Hyperparameter}} Optimization in Hundreds of Dimensions for Vision Architectures},
  booktitle = {Proceedings of the 30th {{International Conference}} on {{Machine Learning}}},
  author = {Bergstra, James and Yamins, Daniel and Cox, David Daniel},
  date = {2013},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {28},
  pages = {115--123},
  publisher = {{PMLR}}
}

% NIPS 2011 conference paper (volume 24 of the proceedings series).
@inproceedings{bergstraAlgorithmsHyperParameterOptimization,
  title = {Algorithms for {{Hyper-Parameter Optimization}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Bergstra, James S. and Bardenet, Rémi and Bengio, Yoshua and Kégl, Balázs},
  date = {2011},
  volume = {24},
  pages = {2546--2554},
  abstract = {Several recent advances to the state of the art in image classification benchmarks have come from better configurations of existing techniques rather than novel approaches to feature learning. Traditionally, hyper-parameter optimization has been the job of humans because they can be very efficient in regimes where only a few trials are possible. Presently, computer clusters and GPU processors make it possible to run more trials and we show that algorithmic approaches can find better results. We present hyper-parameter optimization results on tasks of training neural networks and deep belief networks (DBNs). We optimize hyper-parameters using random search and two new greedy sequential methods based on the expected improvement criterion. Random search has been shown to be sufficiently efficient for learning neural networks for several datasets, but we show it is unreliable for training DBNs. The sequential algorithms are applied to the most difficult DBN learning problems from [1] and find significantly better results than the best previously reported. This work contributes novel techniques for making response surface models P (y|x) in which many elements of hyper-parameter assignment (x) are known to be irrelevant given particular values of other elements.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/AVQ7LGWB/Bergstra et al. - Algorithms for Hyper-Parameter Optimization.pdf}
}

% IEEE TSE article on comb transformers; pages are as exported (no volume/number recorded).
@article{bertolottiCombTransformersStatementWiseTransformers2023,
  title = {{{CombTransformers}}: {{Statement-Wise Transformers}} for {{Statement-Wise Representations}}},
  shorttitle = {{{CombTransformers}}},
  author = {Bertolotti, Francesco and Cazzola, Walter},
  date = {2023},
  journaltitle = {IEEE Transactions on Software Engineering},
  shortjournal = {IEEE Trans. Software Eng.},
  pages = {1--13},
  issn = {0098-5589, 1939-3520, 2326-3881},
  doi = {10.1109/TSE.2023.3310793},
  url = {https://ieeexplore.ieee.org/document/10242162/},
  urldate = {2023-10-10},
  abstract = {This study presents a novel category of Transformer architectures known as comb transformers, which effectively reduce the space complexity of the self-attention layer from a quadratic to a subquadratic level. This is achieved by processing sequence segments independently and incorporating 𝒳 -word embeddings to merge cross-segment information. The reduction in attention memory requirements enables the deployment of deeper architectures, potentially leading to more competitive outcomes. Furthermore, we design an abstract syntax tree (AST)-based code representation to effectively exploit comb transformer properties. To explore the potential of our approach, we develop nine specific instances based on three popular architectural concepts: funnel, hourglass, and encoder-decoder. These architectures are subsequently trained on three code-related tasks: method name generation, code search, and code summarization. These tasks encompass a range of capabilities: short/long sequence generation and classification. In addition to the proposed comb transformers, we also evaluate several baseline architectures for comparative analysis. Our findings demonstrate that the comb transformers match the performance of the baselines and frequently perform better.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/K437MHDJ/Bertolotti and Cazzola - 2023 - CombTransformers Statement-Wise Transformers for .pdf}
}

% WIREs review of hyperparameter optimization; pages holds the article number.
@article{bischlHyperparameterOptimizationFoundations2023,
  title = {Hyperparameter Optimization: {{Foundations}}, Algorithms, Best Practices, and Open Challenges},
  shorttitle = {Hyperparameter Optimization},
  author = {Bischl, Bernd and Binder, Martin and Lang, Michel and Pielok, Tobias and Richter, Jakob and Coors, Stefan and Thomas, Janek and Ullmann, Theresa and Becker, Marc and Boulesteix, Anne-Laure and Deng, Difan and Lindauer, Marius},
  date = {2023-03},
  journaltitle = {WIREs Data Mining and Knowledge Discovery},
  shortjournal = {WIREs Data Min \& Knowl},
  volume = {13},
  number = {2},
  pages = {e1484},
  issn = {1942-4787, 1942-4795},
  doi = {10.1002/widm.1484},
  url = {https://wires.onlinelibrary.wiley.com/doi/10.1002/widm.1484},
  urldate = {2023-11-27},
  abstract = {Most machine learning algorithms are configured by a set of hyperparameters whose values must be carefully chosen and which often considerably impact performance. To avoid a time-consuming and irreproducible manual process of trial-and-error to find well-performing hyperparameter configurations, various automatic hyperparameter optimization (HPO) methods—for example, based on resampling error estimation for supervised machine learning—can be employed. After introducing HPO from a general perspective, this paper reviews important HPO methods, from simple techniques such as grid or random search to more advanced methods like evolution strategies, Bayesian optimization, Hyperband, and racing. This work gives practical recommendations regarding important choices to be made when conducting HPO, including the HPO algorithms themselves, performance evaluation, how to combine HPO with machine learning pipelines, runtime improvements, and parallelization.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/WJNGSG9A/Bischl et al. - 2023 - Hyperparameter optimization Foundations, algorith.pdf}
}

% SIAM Review survey of numerical optimization methods for machine learning.
@article{bottouOptimizationMethodsLargeScale2018,
  author = {Bottou, Léon and Curtis, Frank E. and Nocedal, Jorge},
  title = {Optimization {{Methods}} for {{Large-Scale Machine Learning}}},
  journaltitle = {SIAM Review},
  shortjournal = {SIAM Rev.},
  date = {2018-01},
  volume = {60},
  number = {2},
  pages = {223--311},
  issn = {0036-1445, 1095-7200},
  doi = {10.1137/16M1080173},
  url = {https://epubs.siam.org/doi/10.1137/16M1080173},
  urldate = {2023-11-25},
  langid = {english},
  abstract = {This paper provides a review and commentary on the past, present, and future of numerical optimization algorithms in the context of machine learning applications. Through case studies on text classification and the training of deep neural networks, we discuss how optimization problems arise in machine learning and what makes them challenging. A major theme of our study is that large-scale machine learning represents a distinctive setting in which the stochastic gradient (SG) method has traditionally played a central role while conventional gradient-based nonlinear optimization techniques typically falter. Based on this viewpoint, we present a comprehensive theory of a straightforward, yet versatile SG algorithm, discuss its practical behavior, and highlight opportunities for designing algorithms with improved performance. This leads to a discussion about the next generation of optimization methods for large-scale machine learning, including an investigation of two main streams of research on techniques that diminish noise in the stochastic directions and methods that make use of second-order derivative approximations.},
  file = {/Users/ryedida/Zotero/storage/L2L67UKY/Bottou et al. - 2018 - Optimization Methods for Large-Scale Machine Learn.pdf}
}

% Pharmacopsychiatry consensus review on pharmacogenomic (PGx) testing; abstract whitespace normalized.
@article{bousmanReviewConsensusPharmacogenomic2021,
  title = {Review and {{Consensus}} on {{Pharmacogenomic Testing}} in {{Psychiatry}}},
  author = {Bousman, Chad A. and Bengesser, Susanne A. and Aitchison, Katherine J. and Amare, Azmeraw T. and Aschauer, Harald and Baune, Bernhard T. and Asl, Bahareh Behroozi and Bishop, Jeffrey R. and Burmeister, Margit and Chaumette, Boris and Chen, Li-Shiun and Cordner, Zachary A. and Deckert, Jürgen and Degenhardt, Franziska and DeLisi, Lynn E. and Folkersen, Lasse and Kennedy, James L. and Klein, Teri E. and McClay, Joseph L. and McMahon, Francis J. and Musil, Richard and Saccone, Nancy L. and Sangkuhl, Katrin and Stowe, Robert M. and Tan, Ene-Choo and Tiwari, Arun K. and Zai, Clement C. and Zai, Gwyneth and Zhang, Jianping and Gaedigk, Andrea and Müller, Daniel J.},
  date = {2021-01},
  journaltitle = {Pharmacopsychiatry},
  shortjournal = {Pharmacopsychiatry},
  volume = {54},
  number = {1},
  pages = {5--17},
  issn = {0176-3679, 1439-0795},
  doi = {10.1055/a-1288-1061},
  url = {http://www.thieme-connect.de/DOI/DOI?10.1055/a-1288-1061},
  urldate = {2023-10-01},
  abstract = {The implementation of pharmacogenomic (PGx) testing in psychiatry remains modest, in part due to divergent perceptions of the quality and completeness of the evidence base and diverse perspectives on the clinical utility of PGx testing among psychiatrists and other healthcare providers. Recognizing the current lack of consensus within the field, the International Society of Psychiatric Genetics assembled a group of experts to conduct a narrative synthesis of the PGx literature, prescribing guidelines, and product labels related to psychotropic medications as well as the key considerations and limitations related to the use of PGx testing in psychiatry. The group concluded that to inform medication selection and dosing of several commonly-used antidepressant and antipsychotic medications, current published evidence, prescribing guidelines, and product labels support the use of PGx testing for 2 cytochrome P450 genes (CYP2D6, CYP2C19). In addition, the evidence supports testing for human leukocyte antigen genes when using the mood stabilizers carbamazepine (HLA-A and HLA-B), oxcarbazepine (HLA-B), and phenytoin (CYP2C9, HLA-B). For valproate, screening for variants in certain genes (POLG, OTC, CSP1) is recommended when a mitochondrial disorder or a urea cycle disorder is suspected. Although barriers to implementing PGx testing remain to be fully resolved, the current trajectory of discovery and innovation in the field suggests these barriers will be overcome and testing will become an important tool in psychiatry.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/GU8LEF4L/Bousman et al. - 2021 - Review and Consensus on Pharmacogenomic Testing in.pdf}
}

% JMLR article (volume 2, 2002) on algorithmic stability and generalization bounds.
@article{bousquetStabilityGeneralization,
  title = {Stability and {{Generalization}}},
  author = {Bousquet, Olivier and Elisseeff, André},
  date = {2002},
  journaltitle = {Journal of Machine Learning Research},
  volume = {2},
  pages = {499--526},
  abstract = {We define notions of stability for learning algorithms and show how to use these notions to derive generalization error bounds based on the empirical error and the leave-one-out error. The methods we use can be applied in the regression framework as well as in the classification one when the classifier is obtained by thresholding a real-valued function. We study the stability properties of large classes of learning algorithms such as regularization based algorithms. In particular we focus on Hilbert space regularization and Kullback-Leibler regularization. We demonstrate how to apply the results to SVM for regression and classification.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/2LPDAISG/Bousquet and Elisseeﬀ - Stability and Generalization.pdf}
}

% Box and Cox (1964); the 1369-7412 identifier is the journal's ISSN, not an ISBN.
@article{boxAnalysisTransformations1964,
  title = {An Analysis of Transformations},
  author = {Box, George E. P. and Cox, David R.},
  date = {1964},
  journaltitle = {Journal of the Royal Statistical Society Series B: Statistical Methodology},
  volume = {26},
  number = {2},
  pages = {211--243},
  publisher = {{Oxford University Press}},
  issn = {1369-7412},
  file = {/Users/ryedida/Zotero/storage/3XSUIZDT/Box_Cox_1964_An analysis of transformations.pdf}
}

% Boyd and Vandenberghe's convex optimization textbook (Cambridge University Press, 2004).
@book{boydConvexOptimization2004,
  author = {Boyd, Stephen P. and Vandenberghe, Lieven},
  title = {Convex Optimization},
  date = {2004},
  publisher = {{Cambridge University Press}},
  location = {{Cambridge, UK ; New York}},
  pagetotal = {716},
  isbn = {978-0-521-83378-3},
  langid = {english},
  keywords = {Convex functions,Mathematical optimization},
  file = {/Users/ryedida/Zotero/storage/395FSFM5/Boyd and Vandenberghe - 2004 - Convex optimization.pdf}
}

% Physical Review Letters article; pages holds the six-digit article number.
@article{brayStatisticsCriticalPoints2007,
  author = {Bray, Alan J. and Dean, David S.},
  title = {Statistics of {{Critical Points}} of {{Gaussian Fields}} on {{Large-Dimensional Spaces}}},
  journaltitle = {Physical Review Letters},
  shortjournal = {Phys. Rev. Lett.},
  date = {2007-04-10},
  volume = {98},
  number = {15},
  pages = {150201},
  issn = {0031-9007, 1079-7114},
  doi = {10.1103/PhysRevLett.98.150201},
  url = {https://link.aps.org/doi/10.1103/PhysRevLett.98.150201},
  urldate = {2023-11-26},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/IXFDBPQI/Bray and Dean - 2007 - Statistics of Critical Points of Gaussian Fields o.pdf}
}

% ICLR 2019 conference paper; ICLR proceedings are unpaginated, so no pages field.
@inproceedings{Brendel2019,
  title = {Approximating {{CNNs}} with {{Bag-of-local-Features}} Models Works Surprisingly Well on {{ImageNet}}},
  booktitle = {International {{Conference}} on {{Learning Representations}}},
  author = {Brendel, Wieland and Bethge, Matthias},
  date = {2019},
  eprint = {1904.00760},
  eprinttype = {arxiv},
  file = {/Users/ryedida/Zotero/storage/FDTI67MM/Brendel, Bethge - 2019 - Approximating CNNs with Bag-of-local-Features models works surprisingly well on ImageNet(2).pdf}
}

% arXiv preprint; the author list is truncated with the literal "and others" token.
@unpublished{brown2020language,
  author = {Brown, Tom B and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  title = {Language {{Models}} Are {{Few-Shot Learners}}},
  date = {2020},
  eprinttype = {arxiv},
  eprint = {2005.14165}
}

% ICML 2019 conference paper, published in Proceedings of Machine Learning Research volume 97.
@inproceedings{brunetUnderstandingOriginsBias,
  title = {Understanding the {{Origins}} of {{Bias}} in {{Word Embeddings}}},
  booktitle = {Proceedings of the 36th {{International Conference}} on {{Machine Learning}}},
  author = {Brunet, Marc-Etienne and Alkalay-Houlihan, Colleen and Anderson, Ashton and Zemel, Richard},
  date = {2019},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {97},
  pages = {803--811},
  publisher = {{PMLR}},
  abstract = {Popular word embedding algorithms exhibit stereotypical biases, such as gender bias. The widespread use of these algorithms in machine learning systems can thus amplify stereotypes in important contexts. Although some methods have been developed to mitigate this problem, how word embedding biases arise during training is poorly understood. In this work, we develop a technique to address this question. Given a word embedding, our method reveals how perturbing the training corpus would affect the resulting embedding bias. By tracing the origins of word embedding bias back to the original training documents, one can identify subsets of documents whose removal would most reduce bias. We demonstrate our methodology on Wikipedia and New York Times corpora, and find it to be very accurate.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/W5IUV5HA/Brunet et al. - Understanding the Origins of Bias in Word Embeddin.pdf}
}

% ICML 2019 conference paper; volume and pages follow the PMLR (volume 97) edition.
@inproceedings{Bubeck2019,
  title = {Adversarial Examples from Computational Constraints},
  booktitle = {36th {{International Conference}} on {{Machine Learning}}, {{ICML}} 2019},
  author = {Bubeck, Sébastien and Lee, Yin Tat and Price, Eric and Razenshteyn, Ilya},
  date = {2019},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {97},
  eprint = {1805.10204},
  eprinttype = {arxiv},
  pages = {831--840},
  abstract = {Why are classifiers in high dimension vulnerable to "adversarial" perturbations? We show that it is likely not due to information theoretic limitations, but rather it could be due to computational constraints. First we prove that, for a broad set of classification tasks, the mere existence of a robust classifier implies that it can be found by a possibly exponential-time algorithm with relatively few training examples. Then we give two particular classification tasks where learning a robust classifier is computationally intractable. More precisely we construct two binary classifications task in high dimensional space which are (i) information theoretically easy to learn robustly for large perturbations, (ii) efficiently learnable (non-robustly) by a simple linear separator, (iii) yet are not efficiently robustly learnable, even for small perturbations. Specifically, for the first task hardness holds for any efficient algorithm in the statistical query (SQ) model, while for the second task we rule out any efficient algorithm under a cryptographic assumption. These examples give an exponential separation between classical learning and robust learning in the statistical query model or under a cryptographic assumption. It suggests that adversarial examples may be an unavoidable byproduct of computational limitations of learning algorithms.},
  isbn = {978-1-5108-8698-8},
  file = {/Users/ryedida/Zotero/storage/29JI96UH/Bubeck et al. - 2019 - Adversarial examples from computational constraints(2).pdf}
}

% Foundations and Trends monograph; number is a double issue, written as a range.
@article{bubeckConvexOptimizationAlgorithms2015,
  title = {Convex {{Optimization}}: {{Algorithms}} and {{Complexity}}},
  shorttitle = {Convex {{Optimization}}},
  author = {Bubeck, Sébastien},
  date = {2015-11-11},
  journaltitle = {Foundations and Trends® in Machine Learning},
  shortjournal = {Found. Trends Mach. Learn.},
  volume = {8},
  number = {3--4},
  pages = {231--357},
  publisher = {{Now Publishers, Inc.}},
  issn = {1935-8237, 1935-8245},
  doi = {10.1561/2200000050},
  url = {https://www.nowpublishers.com/article/Details/MAL-050},
  urldate = {2023-04-30},
  abstract = {Convex Optimization: Algorithms and Complexity},
  langid = {english},
  annotation = {1477 citations (Semantic Scholar/DOI) [2023-04-29]},
  file = {/Users/ryedida/Zotero/storage/KCN63ZA7/Bubeck - 2015 - Convex Optimization Algorithms and Complexity.pdf}
}

% Neural Networks journal article on class imbalance in CNNs; identified by DOI.
@article{budaSystematicStudyClass2018,
  title = {A Systematic Study of the Class Imbalance Problem in Convolutional Neural Networks},
  author = {Buda, Mateusz and Maki, Atsuto and Mazurowski, Maciej A.},
  date = {2018},
  journaltitle = {Neural Networks},
  volume = {106},
  pages = {249--259},
  issn = {1879-2782},
  doi = {10.1016/j.neunet.2018.07.011},
  abstract = {In this study, we systematically investigate the impact of class imbalance on classification performance of convolutional neural networks (CNNs) and compare frequently used methods to address the issue. Class imbalance is a common problem that has been comprehensively studied in classical machine learning, yet very limited systematic research is available in the context of deep learning. In our study, we use three benchmark datasets of increasing complexity, MNIST, CIFAR-10 and ImageNet, to investigate the effects of imbalance on classification and perform an extensive comparison of several methods to address the issue: oversampling, undersampling, two-phase training, and thresholding that compensates for prior class probabilities. Our main evaluation metric is area under the receiver operating characteristic curve (ROC AUC) adjusted to multi-class tasks since overall accuracy metric is associated with notable difficulties in the context of imbalanced data. Based on results from our experiments we conclude that (i) the effect of class imbalance on classification performance is detrimental; (ii) the method of addressing class imbalance that emerged as dominant in almost all analyzed scenarios was oversampling; (iii) oversampling should be applied to the level that completely eliminates the imbalance, whereas the optimal undersampling ratio depends on the extent of imbalance; (iv) as opposed to some classical machine learning models, oversampling does not cause overfitting of CNNs; (v) thresholding should be applied to compensate for prior class probabilities when overall number of properly classified cases is of interest.},
  keywords = {Class imbalance,Convolutional neural networks,Deep learning,Image classification},
  file = {/Users/ryedida/Zotero/storage/UPF5CNFC/Buda, Maki, Mazurowski - 2018 - A systematic study of the class imbalance problem in convolutional neural networks(2).pdf}
}

% NeurIPS 2018 conference paper (volume 31 of the proceedings series).
@inproceedings{bunelUnifiedViewPiecewise,
  title = {A {{Unified View}} of {{Piecewise Linear Neural Network Verification}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Bunel, Rudy R. and Turkaslan, Ilker and Torr, Philip and Kohli, Pushmeet and Mudigonda, Pawan K.},
  date = {2018},
  volume = {31},
  eprint = {1711.00455},
  eprinttype = {arxiv},
  abstract = {The success of Deep Learning and its potential use in many safety-critical applications has motivated research on formal verification of Neural Network (NN) models. Despite the reputation of learned NN models to behave as black boxes and the theoretical hardness of proving their properties, researchers have been successful in verifying some classes of models by exploiting their piecewise linear structure and taking insights from formal methods such as Satisfiability Modulo Theory. These methods are however still far from scaling to realistic neural networks. To facilitate progress on this crucial area, we make two key contributions. First, we present a unified framework that encompasses previous methods. This analysis results in the identification of new methods that combine the strengths of multiple existing approaches, accomplishing a speedup of two orders of magnitude compared to the previous state of the art. Second, we propose a new data set of benchmarks which includes a collection of previously released testcases. We use the benchmark to provide the first experimental comparison of existing algorithms and identify the factors impacting the hardness of verification problems.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/FEYG9TNJ/Bunel et al. - A Unified View of Piecewise Linear Neural Network .pdf}
}

% Buntine and Weigend, Complex Systems 5(6), 1991.
@article{buntineBayesianBackpropagation1991,
  title = {Bayesian Backpropagation},
  author = {Buntine, Wray L. and Weigend, Andreas S.},
  date = {1991},
  journaltitle = {Complex Systems},
  volume = {5},
  number = {6},
  pages = {603--643},
  file = {/Users/ryedida/Zotero/storage/J9IVN3BJ/Buntine_1991_Bayesian backpropagation.pdf}
}

% L-BFGS-B paper (SIAM J. Sci. Comput., 1995). The citation key keeps its exported spelling so
% existing citations still resolve; the trailing "t" on the exported names was a footnote dagger.
@article{byrdtLIMITEDMEMORYALGORITHM,
  title = {A {{Limited Memory Algorithm}} for {{Bound Constrained Optimization}}},
  author = {Byrd, Richard H. and Lu, Peihuang and Nocedal, Jorge and Zhu, Ciyou},
  date = {1995},
  journaltitle = {SIAM Journal on Scientific Computing},
  shortjournal = {SIAM J. Sci. Comput.},
  volume = {16},
  number = {5},
  pages = {1190--1208},
  doi = {10.1137/0916069},
  abstract = {An algorithm for solving large nonlinear optimization problems with simple bounds is described. It is based on the gradient projection method and uses a limited memory BFGS matrix to approximate the Hessian of the objective function. It is shown how to take advantage of the form of the limited memory approximation to implement the algorithm efficiently. The results of numerical tests on a set of large problems are reported.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/YKVJVKM3/BYRDt et al. - A LIMITED MEMORY ALGORITHM FOR BOUND CONSTRAINED O.pdf}
}

% Science article showing that text corpora carry human-like semantic biases.
@article{caliskanSemanticsDerivedAutomatically2017,
  author = {Caliskan, Aylin and Bryson, Joanna J. and Narayanan, Arvind},
  title = {Semantics Derived Automatically from Language Corpora Contain Human-like Biases},
  journaltitle = {Science},
  date = {2017-04-14},
  volume = {356},
  number = {6334},
  pages = {183--186},
  publisher = {{American Association for the Advancement of Science}},
  doi = {10.1126/science.aal4230},
  url = {https://www.science.org/doi/full/10.1126/science.aal4230},
  urldate = {2023-12-09},
  abstract = {Machine learning is a means to derive artificial intelligence by discovering patterns in existing data. Here, we show that applying machine learning to ordinary human language results in human-like semantic biases. We replicated a spectrum of known biases, as measured by the Implicit Association Test, using a widely used, purely statistical machine-learning model trained on a standard corpus of text from the World Wide Web. Our results indicate that text corpora contain recoverable and accurate imprints of our historic biases, whether morally neutral as toward insects or flowers, problematic as toward race or gender, or even simply veridical, reflecting the status quo distribution of gender with respect to careers or first names. Our methods hold promise for identifying and addressing sources of bias in culture, including technology.},
  file = {/Users/ryedida/Zotero/storage/INBT4644/Caliskan et al_2017_Semantics derived automatically from language corpora contain human-like biases.pdf}
}

% ICIP conference paper (DOI prefix ICIP.2017); no volume, since the proceedings are unnumbered.
@inproceedings{Canavan2018,
  title = {Combining Gaze and Demographic Feature Descriptors for Autism Classification},
  booktitle = {Proceedings - {{International Conference}} on {{Image Processing}}, {{ICIP}}},
  author = {Canavan, Shaun and Chen, Melanie and Chen, Song and Valdez, Robert and Yaeger, Miles and Lin, Huiyi and Yin, Lijun},
  date = {2018},
  pages = {3750--3754},
  issn = {1522-4880},
  doi = {10.1109/ICIP.2017.8296983},
  abstract = {People with autism suffer from social challenges and communication difficulties, which may prevent them from leading a fruitful and enjoyable life. It is imperative to diagnose and start treatments for autism as early as possible and, in order to do so, accurate methods of identifying the disorder are vital. We propose a novel method for classifying autism through the use of eye gaze and demographic feature descriptors that include a subject's age and gender. We construct feature descriptors that incorporate the subject's age and gender, as well as features based on eye gaze data. Using eye gaze information from the National Database for Autism Research, we tested our constructed feature descriptors on three different classifiers; random regression forests, C4.5 decision tree, and PART. Our proposed method for classifying autism resulted in a top classification rate of 96.2\%.},
  isbn = {978-1-5090-2175-8},
  keywords = {Autism,Classification,Gaze},
  file = {/Users/ryedida/Zotero/storage/NBAYM2U3/Canavan et al. - 2018 - Combining gaze and demographic feature descriptors for autism classification(2).pdf}
}

% arXiv preprint on constructing provably minimally distorted adversarial examples.
@unpublished{Carlini2017,
  author = {Carlini, Nicholas and Katz, Guy and Barrett, Clark and Dill, David L.},
  title = {Provably {{Minimally-Distorted Adversarial Examples}}},
  date = {2017},
  eprinttype = {arxiv},
  eprint = {1709.10207},
  url = {http://arxiv.org/abs/1709.10207},
  abstract = {The ability to deploy neural networks in real-world, safety-critical systems is severely limited by the presence of adversarial examples: slightly perturbed inputs that are misclassified by the network. In recent years, several techniques have been proposed for increasing robustness to adversarial examples --- and yet most of these have been quickly shown to be vulnerable to future attacks. For example, over half of the defenses proposed by papers accepted at ICLR 2018 have already been broken. We propose to address this difficulty through formal verification techniques. We show how to construct provably minimally distorted adversarial examples: given an arbitrary neural network and input sample, we can construct adversarial examples which we prove are of minimal distortion. Using this approach, we demonstrate that one of the recent ICLR defense proposals, adversarial retraining, provably succeeds at increasing the distortion required to construct adversarial examples by a factor of 4.2.},
  file = {/Users/ryedida/Zotero/storage/MUBPDG33/Carlini et al. - 2017 - Provably Minimally-Distorted Adversarial Examples(2).pdf}
}

% IEEE Symposium on Security and Privacy 2017 conference paper (the Carlini-Wagner attacks).
@inproceedings{Carlini2017a,
  title = {Towards {{Evaluating}} the {{Robustness}} of {{Neural Networks}}},
  booktitle = {Proceedings of the {{IEEE Symposium}} on {{Security}} and {{Privacy}}},
  author = {Carlini, Nicholas and Wagner, David},
  date = {2017},
  eprint = {1608.04644},
  eprinttype = {arxiv},
  pages = {39--57},
  issn = {1081-6011},
  doi = {10.1109/SP.2017.49},
  abstract = {Neural networks provide state-of-the-art results for most machine learning tasks. Unfortunately, neural networks are vulnerable to adversarial examples: given an input x and any target classification t, it is possible to find a new input x' that is similar to x but classified as t. This makes it difficult to apply neural networks in security-critical areas. Defensive distillation is a recently proposed approach that can take an arbitrary neural network, and increase its robustness, reducing the success rate of current attacks' ability to find adversarial examples from 95\% to 0.5\%. In this paper, we demonstrate that defensive distillation does not significantly increase the robustness of neural networks by introducing three new attack algorithms that are successful on both distilled and undistilled neural networks with 100\% probability. Our attacks are tailored to three distance metrics used previously in the literature, and when compared to previous adversarial example generation algorithms, our attacks are often much more effective (and never worse). Furthermore, we propose using high-confidence adversarial examples in a simple transferability test we show can also be used to break defensive distillation. We hope our attacks will be used as a benchmark in future defense attempts to create neural networks that resist adversarial examples.},
  isbn = {9781509055326},
  file = {/Users/ryedida/Zotero/storage/J4L2JXUD/Carlini, Wagner - 2017 - Towards Evaluating the Robustness of Neural Networks(2).pdf}
}

% Workshop paper: typed as inproceedings so the venue is rendered as a proceedings title.
@inproceedings{Carlini2017b,
  title = {Adversarial Examples Are Not Easily Detected: {{Bypassing}} Ten Detection Methods},
  booktitle = {AISec 2017 - Proceedings of the 10th ACM Workshop on Artificial Intelligence and Security, co-located with CCS 2017},
  author = {Carlini, Nicholas and Wagner, David},
  date = {2017},
  eprint = {1705.07263},
  eprinttype = {arxiv},
  pages = {3--14},
  doi = {10.1145/3128572.3140444},
  abstract = {Neural networks are known to be vulnerable to adversarial examples: inputs that are close to natural inputs but classified incorrectly. In order to better understand the space of adversarial examples, we survey ten recent proposals that are designed for detection and compare their efficacy. We show that all can be defeated by constructing new loss functions. We conclude that adversarial examples are significantly harder to detect than previously appreciated, and the properties believed to be intrinsic to adversarial examples are in fact not. Finally, we propose several simple guidelines for evaluating future proposed defenses.},
  isbn = {9781450352024},
  file = {/Users/ryedida/Zotero/storage/RI4XPNB5/Carlini, Wagner - 2017 - Adversarial examples are not easily detected Bypassing ten detection methods(2).pdf}
}

% arXiv preprint 1902.06705 (Carlini et al., 2019).
@unpublished{carliniEvaluatingAdversarialRobustness2019,
  author      = {Carlini, Nicholas and Athalye, Anish and Papernot, Nicolas and Brendel, Wieland and Rauber, Jonas and Tsipras, Dimitris and Goodfellow, Ian and Madry, Aleksander and Kurakin, Alexey},
  title       = {On {{Evaluating Adversarial Robustness}}},
  date        = {2019-02-20},
  eprinttype  = {arxiv},
  eprint      = {1902.06705},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1902.06705},
  urldate     = {2021-04-16},
  langid      = {english},
  keywords    = {Computer Science - Cryptography and Security,Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation  = {277 citations (Semantic Scholar/arXiv) [2021-04-16]},
  abstract    = {Correctly evaluating defenses against adversarial examples has proven to be extremely difficult. Despite the significant amount of recent work attempting to design defenses that withstand adaptive attacks, few have succeeded; most papers that propose defenses are quickly shown to be incorrect.},
  file        = {/Users/ryedida/Zotero/storage/JMA7HN3J/Carlini et al. - 2019 - On Evaluating Adversarial Robustness.pdf}
}

% KDD '16 conference paper. The rank prefix "CPT" was removed from Estella
% Patterson's given name, and PDF line-break hyphens were removed from the abstract.
@inproceedings{Carton2016,
  title = {Identifying {{Police Officers}} at {{Risk}} of {{Adverse Events}}},
  booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining - KDD '16},
  author = {Carton, Samuel and Ghani, Rayid and Helsby, Jennifer and Joseph, Kenneth and Mahmud, Ayesha and Park, Youngsoo and Walsh, Joe and Cody, Crystal and Patterson, Estella and Haynes, Lauren},
  date = {2016},
  pages = {67--76},
  doi = {10.1145/2939672.2939698},
  url = {http://dl.acm.org/citation.cfm?doid=2939672.2939698},
  abstract = {Adverse events between police and the public, such as deadly shootings or instances of racial profiling, can cause serious or deadly harm, damage police legitimacy, and result in costly litigation. Evidence suggests these events can be prevented by targeting interventions based on an Early Intervention System (EIS) that flags police officers who are at a high risk for involvement in such adverse events. Today's EIS are not data-driven and typically rely on simple thresholds based entirely on expert intuition. In this paper, we describe our work with the Charlotte-Mecklenburg Police Department (CMPD) to develop a machine learning model to predict which officers are at risk for an adverse event. Our approach significantly outperforms CMPD's existing EIS, increasing true positives by ∼ 12\% and decreasing false positives by ∼ 32\%. Our work also sheds light on features related to officer characteristics, situational factors, and neighborhood factors that are predictive of adverse events. This work provides a starting point for police departments to take a comprehensive, data-driven approach to improve policing and reduce harm to both officers and members of the public.},
  isbn = {9781450342322},
  file = {/Users/ryedida/Zotero/storage/Y95YZTR6/Carton et al. - 2016 - Identifying Police Officers at Risk of Adverse Events(2).pdf}
}

% NOTE(review): the DOI and ISBN identify the KDD 2006 paper by Bucilua, Caruana and
% Niculescu-Mizil. Authors, year, venue and pages were corrected to match it; please
% confirm against the DOI record. The exported pmid and ISSN (0004-6361 belongs to an
% astronomy journal) were reference-manager junk and were dropped. The cite key is
% unchanged so existing citations keep working.
@inproceedings{Caruana2016,
  title = {Model {{Compression}}},
  booktitle = {Proceedings of the 12th {{ACM SIGKDD International Conference}} on {{Knowledge Discovery}} and {{Data Mining}}},
  author = {Buciluǎ, Cristian and Caruana, Rich and Niculescu-Mizil, Alexandru},
  date = {2006},
  pages = {535--541},
  doi = {10.1145/1150402.1150464},
  abstract = {Often the best performing supervised learning models are ensembles of hundreds or thousands of base-level classifiers. Unfortunately, the space required to store this many classifiers, and the time required to execute them at run-time, prohibits their use in applications where test sets are large (e.g. Google), where storage space is at a premium (e.g. PDAs), and where computational power is limited (e.g. hearing aids). We present a method for "compressing" large, complex ensembles into smaller, faster models, usually without significant loss in performance.},
  isbn = {1595933395},
  keywords = {Algorithms,Experimentation,Measurement,Model Compression,Performance,Reliability,Supervised Learning},
  file = {/Users/ryedida/Zotero/storage/QR7SHRFJ/Caruana - 2016 - Model Compression(2).pdf}
}

% NOTE(review): the DOI resolves to a journal article, so this is typed as an article.
% Author, journal, volume and year were filled in from the DOI record; please confirm.
% The session-token reader URL was replaced by the stable article URL built from the
% same PII, and the reader-page suffix was removed from the title.
@article{CaseStudyResearch,
  title = {Case {{Study Research}} in {{Software Engineering}}—{{It}} Is a {{Case}}, and It Is a {{Study}}, but Is It a {{Case Study}}?},
  author = {Wohlin, Claes},
  date = {2021},
  journaltitle = {Information and Software Technology},
  volume = {133},
  pages = {106514},
  doi = {10.1016/j.infsof.2021.106514},
  url = {https://www.sciencedirect.com/science/article/pii/S0950584921000033},
  urldate = {2021-12-23},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/85HHKFSC/Case Study Research in Software Engineering—It is .pdf;/Users/ryedida/Zotero/storage/P7SRTMPK/S0950584921000033.html}
}

% arXiv preprint: typed as online with pubstate, as other preprints in this file are.
% The date comes from the arXiv identifier (2102 = February 2021). The garbled
% tilde-G symbols in the abstract were restored as math.
@online{chanpuriyaDeepWalkingBackwardsEmbeddings,
  title = {{{DeepWalking Backwards}}: {{From Embeddings Back}} to {{Graphs}}},
  author = {Chanpuriya, Sudhanshu and Musco, Cameron and Sotiropoulos, Konstantinos and Tsourakakis, Charalampos E.},
  date = {2021-02},
  eprint = {2102.08532v1},
  eprinttype = {arxiv},
  pubstate = {preprint},
  abstract = {Low-dimensional node embeddings play a key role in analyzing graph datasets. However, little work studies exactly what information is encoded by popular embedding methods, and how this information correlates with performance in downstream learning tasks. We tackle this question by studying whether embeddings can be inverted to (approximately) recover the graph used to generate them. Focusing on a variant of the popular DeepWalk method [Perozzi et al., 2014, Qiu et al., 2018], we present algorithms for accurate embedding inversion---i.e., from the low-dimensional embedding of a graph $G$, we can find a graph $\tilde{G}$ with a very similar embedding. We perform numerous experiments on real-world networks, observing that significant information about $G$, such as specific edges and bulk properties like triangle density, is often lost in $\tilde{G}$. However, community structure is often preserved or even enhanced. Our findings are a step towards a more rigorous understanding of exactly what information embeddings encode about the input graph, and why this information is useful for learning tasks.}
}

% 1742-5468 has the 4+4 digit shape of an ISSN, so it is stored in issn rather than
% isbn. The acronym SGD is brace-protected in the title.
@article{chaudhariEntropysgdBiasingGradient2019,
  title = {{{Entropy-SGD}}: {{Biasing}} Gradient Descent into Wide Valleys},
  author = {Chaudhari, Pratik and Choromanska, Anna and Soatto, Stefano and LeCun, Yann and Baldassi, Carlo and Borgs, Christian and Chayes, Jennifer and Sagun, Levent and Zecchina, Riccardo},
  date = {2019},
  journaltitle = {Journal of Statistical Mechanics: Theory and Experiment},
  volume = {2019},
  number = {12},
  pages = {124018},
  publisher = {{IOP Publishing}},
  issn = {1742-5468},
  file = {/Users/ryedida/Zotero/storage/RAIQJKJC/Chaudhari et al_2019_Entropy-sgd.pdf}
}

% Journal name normalised to the spelling used by other JAIR entries in this file.
% NOTE(review): the DOI was added from memory of the published record; please confirm.
@article{chawla2002smote,
  title = {{{SMOTE}}: Synthetic Minority over-Sampling Technique},
  author = {Chawla, Nitesh V. and Bowyer, Kevin W. and Hall, Lawrence O. and Kegelmeyer, W. Philip},
  date = {2002},
  journaltitle = {Journal of Artificial Intelligence Research},
  volume = {16},
  pages = {321--357},
  doi = {10.1613/jair.953}
}

% arXiv preprint. The placeholder volume/number (1/1) were dropped because a preprint
% has neither, and the arXiv URL now uses https.
@unpublished{Chen2018,
  title = {High {{Resolution Face Completion}} with {{Multiple Controllable Attributes}} via {{Fully End-to-End Progressive Generative Adversarial Networks}}},
  author = {Chen, Zeyuan and Nie, Shaoliang and Wu, Tianfu and Healey, Christopher G.},
  date = {2018},
  eprint = {1801.07632},
  eprinttype = {arxiv},
  url = {https://arxiv.org/abs/1801.07632},
  abstract = {We present a deep learning approach for high resolution face completion with multiple controllable attributes (e.g., male and smiling) under arbitrary masks. Face completion entails understanding both structural meaningfulness and appearance consistency locally and globally to fill in "holes" whose content do not appear elsewhere in an input image. It is a challenging task with the difficulty level increasing significantly with respect to high resolution, the complexity of "holes" and the controllable attributes of filled-in fragments. Our system addresses the challenges by learning a fully end-to-end framework that trains generative adversarial networks (GANs) progressively from low resolution to high resolution with conditional vectors encoding controllable attributes. We design novel network architectures to exploit information across multiple scales effectively and efficiently. We introduce new loss functions encouraging sharp completion. We show that our system can complete faces with large structural and appearance variations using a single feed-forward pass of computation with mean inference time of 0.007 seconds for images at 1024 x 1024 resolution. We also perform a pilot human study that shows our approach outperforms state-of-the-art face completion methods in terms of rank analysis. The code will be released upon publication.},
  file = {/Users/ryedida/Zotero/storage/R7JXATW9/Chen et al. - 2018 - High Resolution Face Completion with Multiple Controllable Attributes via Fully End-to-End Progressive Generativ(2).pdf}
}

% ESEC/FSE 2018 paper by Chen, Fu, Krishna and Menzies.
@inproceedings{chen2018applications,
  author    = {Chen, Di and Fu, Wei and Krishna, Rahul and Menzies, Tim},
  title     = {Applications of Psychological Science for Actionable Analytics},
  booktitle = {Proceedings of the 2018 26th {{ACM Joint Meeting}} on {{European Software Engineering Conference}} and {{Symposium}} on the {{Foundations}} of {{Software Engineering}}},
  date      = {2018},
  pages     = {456--467}
}

% The acronyms API/APIs are brace-protected so sentence-casing styles cannot lowercase them.
@article{chen2019mining,
  title = {Mining Likely Analogical {{APIs}} across Third-Party Libraries via Large-Scale Unsupervised {{API}} Semantics Embedding},
  author = {Chen, Chunyang and Xing, Zhenchang and Liu, Yang and Ong, Kent Long Xiong},
  date = {2019},
  journaltitle = {IEEE Transactions on Software Engineering},
  publisher = {{IEEE}}
}

% LNCS chapter: the series name moves to series and the bare number to volume.
% NOTE(review): the proceedings title (ATVA 2017) was filled in from the DOI/ISBN
% record; please confirm.
@inproceedings{Cheng2017,
  title = {Maximum Resilience of Artificial Neural Networks},
  booktitle = {Automated {{Technology}} for {{Verification}} and {{Analysis}}},
  author = {Cheng, Chih Hong and Nührenberg, Georg and Ruess, Harald},
  date = {2017},
  series = {Lecture Notes in Computer Science},
  volume = {10482},
  eprint = {1705.01040},
  eprinttype = {arxiv},
  pages = {251--268},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-68167-2_18},
  abstract = {The deployment of Artificial Neural Networks (ANNs) in safety-critical applications poses a number of new verification and certification challenges. In particular, for ANN-enabled self-driving vehicles it is important to establish properties about the resilience of ANNs to noisy or even maliciously manipulated sensory input. We are addressing these challenges by defining resilience properties of ANN-based classifiers as the maximum amount of input or sensor perturbation which is still tolerated. This problem of computing maximum perturbation bounds for ANNs is then reduced to solving mixed integer optimization problems (MIP). A number of MIP encoding heuristics are developed for drastically reducing MIP-solver runtimes, and using parallelization of MIP-solvers results in an almost linear speed-up in the number (up to a certain limit) of computing cores in our experiments. We demonstrate the effectiveness and scalability of our approach by means of computing maximum resilience bounds for a number of ANN benchmark sets ranging from typical image recognition scenarios to the autonomous maneuvering of robots.},
  isbn = {9783319681665}
}

% The event place is stored in venue, because biblatex's location is the publisher's
% place. "oldfashioned" (a PDF line-break artefact) was fixed in the abstract.
@inproceedings{chenObjectDetectionGraphical2020,
  title = {Object Detection for Graphical User Interface: Old Fashioned or Deep Learning or a Combination?},
  shorttitle = {Object Detection for Graphical User Interface},
  booktitle = {Proceedings of the 28th {{ACM Joint Meeting}} on {{European Software Engineering Conference}} and {{Symposium}} on the {{Foundations}} of {{Software Engineering}}},
  author = {Chen, Jieshan and Xie, Mulong and Xing, Zhenchang and Chen, Chunyang and Xu, Xiwei and Zhu, Liming and Li, Guoqiang},
  date = {2020-11-08},
  pages = {1202--1214},
  publisher = {{ACM}},
  venue = {Virtual Event, USA},
  doi = {10.1145/3368089.3409691},
  url = {https://dl.acm.org/doi/10.1145/3368089.3409691},
  urldate = {2021-11-15},
  abstract = {Detecting Graphical User Interface (GUI) elements in GUI images is a domain-specific object detection task. It supports many software engineering tasks, such as GUI animation and testing, GUI search and code generation. Existing studies for GUI element detection directly borrow the mature methods from computer vision (CV) domain, including old fashioned ones that rely on traditional image processing features (e.g., canny edge, contours), and deep learning models that learn to detect from large-scale GUI data. Unfortunately, these CV methods are not originally designed with the awareness of the unique characteristics of GUIs and GUI elements and the high localization accuracy of the GUI element detection task. We conduct the first large-scale empirical study of seven representative GUI element detection methods on over 50k GUI images to understand the capabilities, limitations and effective designs of these methods. This study not only sheds the light on the technical challenges to be addressed but also informs the design of new GUI element detection methods. We accordingly design a new GUI-specific old-fashioned method for non-text GUI element detection which adopts a novel top-down coarse-to-fine strategy, and incorporate it with the mature deep learning model for GUI text detection. Our evaluation on 25,000 GUI images shows that our method significantly advances the start-of-the-art performance in GUI element detection.},
  eventtitle = {{{ESEC}}/{{FSE}} '20: 28th {{ACM Joint European Software Engineering Conference}} and {{Symposium}} on the {{Foundations}} of {{Software Engineering}}},
  isbn = {978-1-4503-7043-1},
  langid = {english},
  annotation = {21 citations (Semantic Scholar/DOI) [2021-11-15]},
  file = {/Users/ryedida/Zotero/storage/DZYLUQUM/Chen et al. - 2020 - Object detection for graphical user interface old.pdf}
}

% The title was restored from the all-caps PDF heading (the "PROP- ERLY" line break is
% removed). The PDF page count moves from pages to pagetotal.
% NOTE(review): the venue (ICLR 2021) was filled in from memory of the published
% record; please confirm.
@inproceedings{chenROBUSTOVERFITTINGMAY2021,
  title = {{{Robust Overfitting May Be Mitigated}} by {{Properly Learned Smoothening}}},
  booktitle = {International {{Conference}} on {{Learning Representations}}},
  author = {Chen, Tianlong and Zhang, Zhenyu and Liu, Sijia and Chang, Shiyu and Wang, Zhangyang},
  date = {2021},
  pagetotal = {19},
  abstract = {A recent study (Rice et al., 2020) revealed overfitting to be a dominant phenomenon in adversarially robust training of deep networks, and that appropriate early-stopping of adversarial training (AT) could match the performance gains of most recent algorithmic improvements. This intriguing problem of robust overfitting motivates us to seek more remedies. As a pilot study, this paper investigates two empirical means to inject more learned smoothening during AT: one leveraging knowledge distillation and self-training to smooth the logits, the other performing stochastic weight averaging (Izmailov et al., 2018) to smooth the weights. Despite the embarrassing simplicity, the two approaches are surprisingly effective and hassle-free in mitigating robust overfitting. Experiments demonstrate that by plugging in them to AT, we can simultaneously boost the standard accuracy by 3.72\% ∼ 6.68\% and robust accuracy by 0.22\% ∼ 2.03\%, across multiple datasets (STL-10, SVHN, CIFAR-10, CIFAR-100, and Tiny ImageNet), perturbation types ($\ell_\infty$ and $\ell_2$), and robustified methods (PGD, TRADES, and FSGM), establishing the new state-of-the-art bar in AT. We present systematic visualizations and analyses to dive into their possible working mechanisms. We also carefully exclude the possibility of gradient masking by evaluating our models’ robustness against transfer attacks. Codes are available at https://github.com/VITA-Group/Alleviate-Robust-Overfitting.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/D28EN32Y/Chen et al. - 2021 - ROBUST OVERFITTING MAY BE MITIGATED BY PROP- ERLY .pdf}
}

% LNCS chapter: the series name moves to series. PDF line-break hyphens were removed
% from the abstract.
% NOTE(review): the proceedings title (ICONIP 2017) was filled in from the DOI/ISBN
% record; please confirm.
@inproceedings{Cheung2017,
  title = {Heterogeneous {{Features Integration}} in {{Deep Knowledge Tracing}}},
  booktitle = {Neural {{Information Processing}}},
  author = {Cheung, Lap Pong and Yang, Haiqin},
  date = {2017},
  series = {Lecture Notes in Computer Science},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-70096-0_67},
  abstract = {Deep recurrent neural networks have been successfully applied to knowledge tracing, namely, deep knowledge tracing (DKT), which aims to automatically trace students' knowledge states by mining their exercise performance data. Two main issues exist in the current DKT models: First, the complexity of the DKT models increases the tension of psychological interpretation. Second, the input of existing DKT models is only the exercise tags representing via one-hot encoding. The correlation between the hidden knowledge components and students' responses to the exercises heavily relies on training the DKT models. The existing rich and informative features are excluded in the training, which may yield sub-optimal performance. To utilize the information embedded in these features, researchers have proposed a manual method to pre-process the features, i.e., discretizing them based on the inner characteristics of individual features. However, the proposed method requires many feature engineering efforts and is infeasible when the selected features are huge. To tackle the above issues, we design an automatic system to embed the heterogeneous features implicitly and effectively into the original DKT model. More specifically, we apply tree-based classifiers to predict whether the student can correctly answer the exercise given the heterogeneous features, an effective way to capture how the student deviates from others in the exercise. The predicted response and the true response are then encoded into a 4-bit one-hot encoding and concatenated with the original one-hot encoding features on the exercise tags to train a long short-term memory (LSTM) model, which can output the probability that a student will answer the exercise correctly on the corresponding exercise. We conduct a thorough evaluation on two educational datasets and demonstrate the merits and observations of our proposal.},
  isbn = {9783319700953}
}

% Conference paper: the venue moves to booktitle. The truncated pseudo-volume
% "2018-Janua" and the copyright prefix in the abstract were export artefacts and
% were removed.
@inproceedings{Cheung2018,
  title = {{{OReONet}}: {{Deep}} Convolutional Network for Oil Reservoir Optimization},
  booktitle = {Proceedings - 2017 IEEE International Conference on Big Data, Big Data 2017},
  author = {Cheung, Chung Ming and Goyal, Palash and Prasanna, Viktor K. and Tehrani, Arash Saber},
  date = {2018},
  pages = {1277--1282},
  doi = {10.1109/BigData.2017.8258055},
  abstract = {In recent years, deep convolutional networks have been successfully used for the tasks of image classification and speech recognition. The highly non-linear modeling combined with its emphasis on local connectivity makes them highly suitable for such tasks. However, their performance in other domains is not well explored. Specifically, in the oil industry, researchers use manual features from time series data as input to various machine learning models. In this paper, we employ deep convolutional autoencoders to extract non linear latent features from time series data. We propose a novel deep network architecture and show its efficacy in two oil field tasks related to reservoir optimization - steam job prediction and slippage detection. We show that our architecture outperforms state-of-the-art methods significantly on steam job prediction. We demonstrate the success of our model on an oil field dataset which consists of production and failure data of over two years. Our architecture achieves a precision of 98\% for precision@50 in steam job prediction, and 25\% improvement over the methods used in the industry. To the best of our knowledge, we are the first to attempt to automatically detect slippage failures in well pumps. We are able to classify slippage events with 70.3\% accuracy, a 10.6\% improvement over using manually defined input features.},
  isbn = {9781538627143},
  keywords = {Convolutional Autoencoder,Machine Learning,Oil Reservoir Optimization}
}

% arXiv preprint 2401.09787 (Cho et al., 2024).
@online{choQueryingEasilyFlipflopped2024,
  author      = {Cho, Seong Jin and Kim, Gwangsu and Lee, Junghyun and Shin, Jinwoo and Yoo, Chang D.},
  title       = {Querying {{Easily Flip-flopped Samples}} for {{Deep Active Learning}}},
  date        = {2024-01-18},
  eprinttype  = {arxiv},
  eprint      = {2401.09787},
  eprintclass = {cs, stat},
  pubstate    = {preprint},
  url         = {http://arxiv.org/abs/2401.09787},
  urldate     = {2024-01-22},
  langid      = {english},
  keywords    = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract    = {Active learning is a machine learning paradigm that aims to improve the performance of a model by strategically selecting and querying unlabeled data. One effective selection strategy is to base it on the model’s predictive uncertainty, which can be interpreted as a measure of how informative a sample is. The sample’s distance to the decision boundary is a natural measure of predictive uncertainty, but it is often intractable to compute, especially for complex decision boundaries formed in multiclass classification tasks. To address this issue, this paper proposes the least disagree metric (LDM), defined as the smallest probability of disagreement of the predicted label, and an estimator for LDM proven to be asymptotically consistent under mild assumptions. The estimator is computationally efficient and can be easily implemented for deep learning models using parameter perturbation. The LDM-based active learning is performed by querying unlabeled data with the smallest LDM. Experimental results show that our LDM-based active learning algorithm obtains state-of-the-art overall performance on all considered datasets and deep architectures.},
  file        = {/Users/ryedida/Zotero/storage/9VK5PMTR/Cho et al. - 2024 - Querying Easily Flip-flopped Samples for Deep Acti.pdf}
}

% The compound surname "Ben Arous" is now given in comma form so it is not split into
% a given name "Gerard Ben" and a surname "Arous".
% NOTE(review): venue, year, volume and pages (AISTATS 2015, PMLR 38) were filled in
% from memory of the published record; please confirm.
@inproceedings{choromanskaLossSurfacesMultilayer,
  title = {The {{Loss Surfaces}} of {{Multilayer Networks}}},
  booktitle = {Proceedings of the {{Eighteenth International Conference}} on {{Artificial Intelligence}} and {{Statistics}}},
  author = {Choromanska, Anna and Henaff, Mikael and Mathieu, Michael and Ben Arous, Gérard and LeCun, Yann},
  date = {2015},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {38},
  pages = {192--204},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/LD8Q9ID8/Choromanska et al. - The Loss Surfaces of Multilayer Networks.pdf}
}

% The required booktitle was missing and author names were initials only.
% NOTE(review): booktitle, full names, pages and DOI (ESEC/FSE 2015 tool track) were
% filled in from memory of the published record; please confirm.
@inproceedings{commitguru,
  title = {Commit {{Guru}}: {{Analytics}} and {{Risk Prediction}} of {{Software Commits}}},
  booktitle = {Proceedings of the 2015 10th {{Joint Meeting}} on {{Foundations}} of {{Software Engineering}}},
  author = {Rosen, Christoffer and Grawi, Ben and Shihab, Emad},
  date = {2015},
  pages = {966--969},
  doi = {10.1145/2786805.2803183},
  keywords = {Risky Software Commits,Software Analytics,Software Metrics,Software Prediction}
}

% NOTE(review): the exported record was mis-parsed from the PDF heading: the title
% words "Relative Concept Importance Testing" had been split into fake author names,
% and "Iclr2018"/"Ml" were stuffed into journaltitle/issue. The authors below are
% reconstructed from the ICLR 2018 submission with this title. Confirm them against
% the attached PDF, and check whether the full title carries a longer subtitle.
@unpublished{Concept2018,
  title = {{{TCAV}}: {{Relative Concept Importance Testing}}},
  author = {Kim, Been and Gilmer, Justin and Wattenberg, Martin and Viégas, Fernanda},
  date = {2018},
  note = {Submitted to {{ICLR}} 2018},
  file = {/Users/ryedida/Zotero/storage/B36KGX7B/Concept, Testing - 2018 - Tcav R Elative Concept Importance Testing(2).pdf}
}

% Journal article (it has journaltitle, volume and pages), so it is typed as article
% and Kluwer moves from institution to publisher. The surname typo "Corbetr" is
% corrected to "Corbett"; the cite key is kept so existing citations still resolve.
% NOTE(review): the issue number and DOI were added from memory of the published
% record; please confirm.
@article{Corbetr1995,
  title = {Knowledge {{Tracing}}: {{Modeling}} the {{Acquisition}} of {{Procedural Knowledge}}},
  author = {Corbett, Albert T. and Anderson, John R.},
  date = {1995},
  journaltitle = {User Modeling and User-Adapted Interaction},
  volume = {4},
  number = {4},
  pages = {253--278},
  publisher = {{Kluwer Academic Publishers}},
  doi = {10.1007/BF01099821},
  abstract = {This paper describes an effort to model students' changing knowledge state during skill acquisition. Students in this research are learning to write short programs with the ACT Programming Tutor (APT). APT is constructed around a production rule cognitive model of programming knowledge, called the ideal student model. This model allows the tutor to solve exercises along with the student and provide assistance as necessary. As the student works, the tutor also maintains an estimate of the probability that the student has learned each of the rules in the ideal model, in a process called knowledge tracing. The tutor presents an individualized sequence of exercises to the student based on these probability estimates until the student has 'mastered' each rule. The programming tutor, cognitive model and learning and performance assumptions are described. A series of studies is reviewed that examine the empirical validity of knowledge tracing and has led to modifications in the process. Currently the model is quite successful in predicting test performance. Further modifications in the modeling process are discussed that may improve performance levels.},
  keywords = {empirical validity,individual differences,intelligent tutoring systems,learning,mastery learning,procedural knowledge,Student modeling},
  file = {/Users/ryedida/Zotero/storage/FJSKXAUI/Corbetr, Anderson - 1995 - Knowledge Tracing Modeling the Acquisition of Procedural Knowledge(2).pdf}
}

% JAIR volume 74 article; the author list is truncated with "and others".
@article{cowen2022hebo,
  author       = {Cowen-Rivers, Alexander I and Lyu, Wenlong and Tutunov, Rasul and Wang, Zhi and Grosnit, Antoine and Griffiths, Ryan Rhys and Maraval, Alexandre Max and Jianye, Hao and Wang, Jun and Peters, Jan and others},
  title        = {{{HEBO}}: Pushing the Limits of Sample-Efficient Hyper-Parameter Optimisation},
  journaltitle = {Journal of Artificial Intelligence Research},
  date         = {2022},
  volume       = {74},
  pages        = {1269--1349},
  file         = {/Users/ryedida/Zotero/storage/3272ZMB9/Cowen-Rivers et al_2022_HEBO.pdf}
}

% LNCS chapter: the series name moves to series and the bare number to volume.
% NOTE(review): the proceedings title (GCPR 2018) was filled in from the DOI/ISBN
% record; please confirm.
@inproceedings{Croce2019,
  title = {A {{Randomized Gradient-Free Attack}} on {{ReLU Networks}}},
  booktitle = {Pattern {{Recognition}}},
  author = {Croce, Francesco and Hein, Matthias},
  date = {2019},
  series = {Lecture Notes in Computer Science},
  volume = {11269},
  eprint = {1811.11493},
  eprinttype = {arxiv},
  pages = {215--227},
  issn = {1611-3349},
  doi = {10.1007/978-3-030-12939-2_16},
  abstract = {It has recently been shown that neural networks but also other classifiers are vulnerable to so called adversarial attacks e.g. in object recognition an almost non-perceivable change of the image changes the decision of the classifier. Relatively fast heuristics have been proposed to produce these adversarial inputs but the problem of finding the optimal adversarial input, that is with the minimal change of the input, is NP-hard. While methods based on mixed-integer optimization which find the optimal adversarial input have been developed, they do not scale to large networks. Currently, the attack scheme proposed by Carlini and Wagner is considered to produce the best adversarial inputs. In this paper we propose a new attack scheme for the class of ReLU networks based on a direct optimization on the resulting linear regions. In our experimental validation we improve in all except one experiment out of 18 over the Carlini-Wagner attack with a relative improvement of up to 9\%. As our approach is based on the geometrical structure of ReLU networks, it is less susceptible to defences targeting their functional properties.},
  isbn = {9783030129385},
  keywords = {Adversarial manipulation,Robustness of classifiers},
  file = {/Users/ryedida/Zotero/storage/5LU8HF6V/Croce, Hein - 2019 - A Randomized Gradient-Free Attack on ReLU Networks(2).pdf}
}

% The PDF page count moves from pages to pagetotal. Lost math and the broken
% "stateof-the-art" were repaired in the abstract.
% NOTE(review): venue and year (ICML 2020, PMLR 119) were filled in from memory of the
% published record; please confirm.
@inproceedings{croceMinimallyDistortedAdversarial,
  title = {Minimally Distorted {{Adversarial Examples}} with a {{Fast Adaptive Boundary Attack}}},
  booktitle = {Proceedings of the 37th {{International Conference}} on {{Machine Learning}}},
  author = {Croce, Francesco and Hein, Matthias},
  date = {2020},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {119},
  pagetotal = {24},
  abstract = {The evaluation of robustness against adversarial manipulation of neural networks-based classifiers is mainly tested with empirical attacks as methods for the exact computation, even when available, do not scale to large networks. We propose in this paper a new white-box adversarial attack wrt the $l_p$-norms for $p \in \{1, 2, \infty\}$ aiming at finding the minimal perturbation necessary to change the class of a given input. It has an intuitive geometric meaning, yields quickly high quality results, minimizes the size of the perturbation (so that it returns the robust accuracy at every threshold with a single run). It performs better or similar to state-of-the-art attacks which are partially specialized to one $l_p$-norm, and is robust to the phenomenon of gradient masking.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/8NIEESKZ/Croce and Hein - Minimally distorted Adversarial Examples with a Fa.pdf}
}

% arXiv preprint 2003.01690 (Croce and Hein, 2020).
@unpublished{croceReliableEvaluationAdversarial2020,
  author      = {Croce, Francesco and Hein, Matthias},
  title       = {Reliable Evaluation of Adversarial Robustness with an Ensemble of Diverse Parameter-Free Attacks},
  date        = {2020-08-04},
  eprinttype  = {arxiv},
  eprint      = {2003.01690},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/2003.01690},
  urldate     = {2021-04-16},
  langid      = {english},
  keywords    = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation  = {90 citations (Semantic Scholar/arXiv) [2021-04-16]},
  abstract    = {The field of defense strategies against adversarial attacks has significantly grown over the last years, but progress is hampered as the evaluation of adversarial defenses is often insufficient and thus gives a wrong impression of robustness. Many promising defenses could be broken later on, making it difficult to identify the state-of-the-art. Frequent pitfalls in the evaluation are improper tuning of hyperparameters of the attacks, gradient obfuscation or masking. In this paper we first propose two extensions of the PGD-attack overcoming failures due to suboptimal step size and problems of the objective function. We then combine our novel attacks with two complementary existing ones to form a parameter-free, computationally affordable and user-independent ensemble of attacks to test adversarial robustness. We apply our ensemble to over 50 models from papers published at recent top machine learning and computer vision venues. In all except one of the cases we achieve lower robust test accuracy than reported in these papers, often by more than 10\%, identifying several broken defenses.},
  file        = {/Users/ryedida/Zotero/storage/RFKJ5WTA/Croce and Hein - 2020 - Reliable evaluation of adversarial robustness with.pdf}
}

@article{cybenkoApproximationSuperpositionsSigmoidal1989,
  title = {Approximation by Superpositions of a Sigmoidal Function},
  author = {Cybenko, George},
  date = {1989},
  journaltitle = {Mathematics of Control, Signals, and Systems},
  volume = {2},
  number = {4},
  pages = {303--314},
  publisher = {{Springer}},
  keywords = {approximation,completeness,neural networks},
  file = {/Users/ryedida/Zotero/storage/S9GFQ3IK/Cybenkot - 1989 - Approximation by Superpositions of a Sigmoidal Function(2).pdf}
}

@article{dablainDeepSMOTEFusingDeep2022,
  title = {{{DeepSMOTE}}: {{Fusing Deep Learning}} and {{SMOTE}} for {{Imbalanced Data}}},
  shorttitle = {{{DeepSMOTE}}},
  author = {Dablain, Damien and Krawczyk, Bartosz and Chawla, Nitesh V.},
  date = {2022},
  journaltitle = {IEEE Transactions on Neural Networks and Learning Systems},
  pages = {1--15},
  issn = {2162-2388},
  doi = {10.1109/TNNLS.2021.3136503},
  abstract = {Despite over two decades of progress, imbalanced data is still considered a significant challenge for contemporary machine learning models. Modern advances in deep learning have further magnified the importance of the imbalanced data problem, especially when learning from images. Therefore, there is a need for an oversampling method that is specifically tailored to deep learning models, can work on raw images while preserving their properties, and is capable of generating high-quality, artificial images that can enhance minority classes and balance the training set. We propose Deep synthetic minority oversampling technique (SMOTE), a novel oversampling algorithm for deep learning models that leverages the properties of the successful SMOTE algorithm. It is simple, yet effective in its design. It consists of three major components: 1) an encoder/decoder framework; 2) SMOTE-based oversampling; and 3) a dedicated loss function that is enhanced with a penalty term. An important advantage of DeepSMOTE over generative adversarial network (GAN)-based oversampling is that DeepSMOTE does not require a discriminator, and it generates high-quality artificial images that are both information-rich and suitable for visual inspection. DeepSMOTE code is publicly available at https://github.com/dd1github/DeepSMOTE.},
  keywords = {Class imbalance,Data models,deep learning,Deep learning,Image reconstruction,Inspection,Learning systems,machine learning,oversampling,synthetic minority oversampling technique (SMOTE),Training,Visualization},
  annotation = {35 citations (Semantic Scholar/DOI) [2023-04-06]},
  file = {/Users/ryedida/Zotero/storage/4KYEQSH8/Dablain et al. - 2022 - DeepSMOTE Fusing Deep Learning and SMOTE for Imba.pdf;/Users/ryedida/Zotero/storage/JULGC935/stamp.html}
}

@inproceedings{dasgupta1992power,
  title = {The Power of Approximating: A Comparison of Activation Functions},
  booktitle = {Advances in Neural Information Processing Systems},
  author = {DasGupta, Bhaskar and Schnitger, Georg},
  date = {1992},
  volume = {5}
}

@inproceedings{dauphinIdentifyingAttackingSaddle,
  title = {Identifying and Attacking the Saddle Point Problem in High-Dimensional Non-Convex Optimization},
  booktitle = {Advances in Neural Information Processing Systems},
  author = {Dauphin, Yann N and Pascanu, Razvan and Gulcehre, Caglar and Cho, Kyunghyun and Ganguli, Surya and Bengio, Yoshua},
  date = {2014},
  volume = {27},
  abstract = {A central challenge to many fields of science and engineering involves minimizing non-convex error functions over continuous, high dimensional spaces. Gradient descent or quasi-Newton methods are almost ubiquitously used to perform such minimizations, and it is often thought that a main source of difficulty for these local methods to find the global minimum is the proliferation of local minima with much higher error than the global minimum. Here we argue, based on results from statistical physics, random matrix theory, neural network theory, and empirical evidence, that a deeper and more profound difficulty originates from the proliferation of saddle points, not local minima, especially in high dimensional problems of practical interest. Such saddle points are surrounded by high error plateaus that can dramatically slow down learning, and give the illusory impression of the existence of a local minimum. Motivated by these arguments, we propose a new approach to second-order optimization, the saddle-free Newton method, that can rapidly escape high dimensional saddle points, unlike gradient descent and quasi-Newton methods. We apply this algorithm to deep or recurrent neural network training, and provide numerical evidence for its superior optimization performance.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/E7X3FM48/Dauphin et al. - Identifying and attacking the saddle point problem.pdf}
}

% NOTE(review): the eprint id below was imported together with an abstract that
% described an unrelated Sina Weibo/Twitter study (abstract removed); confirm
% that the id actually resolves to this LAK '16 paper before relying on it.
@inproceedings{David2016,
  title = {Sequencing Educational Content in Classrooms Using {{Bayesian}} Knowledge Tracing},
  booktitle = {Proceedings of the {{Sixth International Conference}} on {{Learning Analytics}} \& {{Knowledge}} - {{LAK}} '16},
  author = {Ben David, Yossi and Segal, Avi and Gal, Ya'akov (Kobi)},
  date = {2016},
  eprint = {1508.06655v1},
  eprinttype = {arxiv},
  doi = {10.1145/2883851.2883885},
  isbn = {978-1-4503-4190-5},
  file = {/Users/ryedida/Zotero/storage/BHFA8SMC/David, Segal, Gal - 2016 - Sequencing educational content in classrooms using Bayesian knowledge tracing(2).pdf}
}

@article{deb2005evaluating,
  title = {Evaluating the {$\varepsilon$}-Domination Based Multi-Objective Evolutionary Algorithm for a Quick Computation of {{Pareto-optimal}} Solutions},
  author = {Deb, Kalyanmoy and Mohan, Manikanth and Mishra, Shikhar},
  date = {2005},
  journaltitle = {Evolutionary Computation},
  volume = {13},
  number = {4},
  pages = {501--525},
  publisher = {{MIT Press}}
}

@inproceedings{defazioSAGAFastIncremental,
  title = {{{SAGA}}: {{A Fast Incremental Gradient Method With Support}} for {{Non-Strongly Convex Composite Objectives}}},
  booktitle = {Advances in Neural Information Processing Systems},
  author = {Defazio, Aaron and Bach, Francis and Lacoste-Julien, Simon},
  date = {2014},
  volume = {27},
  abstract = {In this work we introduce a new optimisation method called SAGA in the spirit of SAG, SDCA, MISO and SVRG, a set of recently proposed incremental gradient algorithms with fast linear convergence rates. SAGA improves on the theory behind SAG and SVRG, with better theoretical convergence rates, and has support for composite objectives where a proximal operator is used on the regulariser. Unlike SDCA, SAGA supports non-strongly convex problems directly, and is adaptive to any inherent strong convexity of the problem. We give experimental results showing the effectiveness of our method.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/WS935C8E/Defazio et al. - SAGA A Fast Incremental Gradient Method With Supp.pdf}
}

@article{defossezSimpleConvergenceProof,
  title = {A {{Simple Convergence Proof}} of {{Adam}} and {{Adagrad}}},
  author = {Défossez, Alexandre and Bottou, Léon and Bach, Francis and Usunier, Nicolas},
  date = {2022},
  journaltitle = {Transactions on Machine Learning Research},
  abstract = {We provide a simple proof of convergence covering both the Adam and Adagrad adaptive optimization algorithms when applied to smooth (possibly non-convex) objective functions with bounded gradients. We show that in expectation, the squared norm of the objective gradient averaged over the trajectory has an upper-bound which is explicit in the constants of the problem, parameters of the optimizer, the dimension $d$, and the total number of iterations $N$. This bound can be made arbitrarily small, and with the right hyper-parameters, Adam can be shown to converge with the same rate of convergence $O(d \ln(N)/\sqrt{N})$. When used with the default parameters, Adam doesn’t converge, however, and just like constant stepsize SGD, it moves away from the initialization point faster than Adagrad, which might explain its practical success. Finally, we obtain the tightest dependency on the heavy ball momentum decay rate $\beta_1$ among all previous convergence bounds for non-convex Adam and Adagrad, improving from $O((1 - \beta_1)^{-3})$ to $O((1 - \beta_1)^{-1})$.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/RCFV4I3U/Défossez et al. - A Simple Convergence Proof of Adam and Adagrad.pdf}
}

@inproceedings{Dhillon2018,
  title = {Stochastic Activation Pruning for Robust Adversarial Defense},
  booktitle = {6th International Conference on Learning Representations, ICLR 2018 - Conference Track Proceedings},
  author = {Dhillon, Guneet S. and Azizzadenesheli, Kamyar and Lipton, Zachary C. and Bernstein, Jeremy and Kossaifi, Jean and Khanna, Aran and Anandkumar, Anima},
  date = {2018},
  eprint = {1803.01442},
  eprinttype = {arxiv},
  pages = {1--13},
  abstract = {Neural networks are known to be vulnerable to adversarial examples. Carefully chosen perturbations to real images, while imperceptible to humans, induce misclassification and threaten the reliability of deep learning systems in the wild. To guard against adversarial examples, we take inspiration from game theory and cast the problem as a minimax zero-sum game between the adversary and the model. In general, for such games, the optimal strategy for both players requires a stochastic policy, also known as a mixed strategy. In this light, we propose Stochastic Activation Pruning (SAP), a mixed strategy for adversarial defense. SAP prunes a random subset of activations (preferentially pruning those with smaller magnitude) and scales up the survivors to compensate. We can apply SAP to pretrained networks, including adversarially trained models, without fine-tuning, providing robustness against adversarial examples. Experiments demonstrate that SAP confers robustness against attacks, increasing accuracy and preserving calibration.},
  file = {/Users/ryedida/Zotero/storage/YPDJBL28/Dhillon et al. - 2018 - Stochastic activation pruning for robust adversarial defense(2).pdf}
}

@article{dingLearningGeneralizableCode2023,
  title = {Towards {{Learning Generalizable Code Embeddings Using Task-agnostic Graph Convolutional Networks}}},
  author = {Ding, Zishuo and Li, Heng and Shang, Weiyi and Chen, Tse-Hsun (Peter)},
  date = {2023-04-30},
  journaltitle = {ACM Transactions on Software Engineering and Methodology},
  shortjournal = {ACM Trans. Softw. Eng. Methodol.},
  volume = {32},
  number = {2},
  pages = {1--43},
  issn = {1049-331X, 1557-7392},
  doi = {10.1145/3542944},
  url = {https://dl.acm.org/doi/10.1145/3542944},
  urldate = {2023-10-06},
  abstract = {Code embeddings have seen increasing applications in software engineering (SE) research and practice recently. Despite the advances in embedding techniques applied in SE research, one of the main challenges is their generalizability. A recent study finds that code embeddings may not be readily leveraged for the downstream tasks that the embeddings are not particularly trained for. Therefore, in this article, we propose GraphCodeVec, which represents the source code as graphs and leverages the Graph Convolutional Networks to learn more generalizable code embeddings in a task-agnostic manner. The edges in the graph representation are automatically constructed from the paths in the abstract syntax trees, and the nodes from the tokens in the source code. To evaluate the effectiveness of GraphCodeVec, we consider three downstream benchmark tasks (i.e., code comment generation, code authorship identification, and code clones detection) that are used in a prior benchmarking of code embeddings and add three new downstream tasks (i.e., source code classification, logging statements prediction, and software defect prediction), resulting in a total of six downstream tasks that are considered in our evaluation. For each downstream task, we apply the embeddings learned by GraphCodeVec and the embeddings learned from four baseline approaches and compare their respective performance. We find that GraphCodeVec outperforms all the baselines in five out of the six downstream tasks, and its performance is relatively stable across different tasks and datasets. In addition, we perform ablation experiments to understand the impacts of the training context (i.e., the graph context extracted from the abstract syntax trees) and the training model (i.e., the Graph Convolutional Networks) on the effectiveness of the generated embeddings. The results show that both the graph context and the Graph Convolutional Networks can benefit GraphCodeVec in producing high-quality embeddings for the downstream tasks, while the improvement by Graph Convolutional Networks is more robust across different downstream tasks and datasets. Our findings suggest that future research and practice may consider using graph-based deep learning methods to capture the structural information of the source code for SE tasks.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/XVV3V2L3/Ding et al. - 2023 - Towards Learning Generalizable Code Embeddings Usi.pdf}
}

@inproceedings{domhan2015speeding,
  title = {Speeding up Automatic Hyperparameter Optimization of Deep Neural Networks by Extrapolation of Learning Curves},
  booktitle = {Twenty-Fourth International Joint Conference on Artificial Intelligence},
  author = {Domhan, Tobias and Springenberg, Jost Tobias and Hutter, Frank},
  date = {2015}
}

@article{Dong2013,
  title = {Modeling Functional Roles Dynamics in Small Group Interactions},
  author = {Dong, Wen and Lepri, Bruno and Pianesi, Fabio and Pentland, Alex},
  date = {2013},
  journaltitle = {IEEE Transactions on Multimedia},
  volume = {15},
  number = {1},
  pages = {83--95},
  issn = {1520-9210},
  doi = {10.1109/TMM.2012.2225039},
  abstract = {The paper addresses the automatic recognition of social and task-oriented functional roles in small-group meetings, focusing on several properties: a) the importance of non-linguistic behaviors, b) the relative time-consistency of the social roles played by a given person during the course of a meeting, and c) the interplays and mutual constraints among the roles enacted by the different participants in a social encounter. In particular, this paper proposes that the Influence Model framework can address these properties of functional roles, and compares the performance obtained by this framework to the performances of models that consider only property (a) (SVM), and to those that address both (a) and (b) (HMM). The results obtained confirm our expectations: the classification of social functional roles improves if models account for temporal dependencies among the roles played by the same subject, for the time properties of the roles played by each individual, and for the mutual constraints among the roles of different group members. The two versions of the Influence Model (IM and newIM), which encode all three properties together, outperform both the SVM and the HMM on most of the figures of merit used. Of particular interest is the capability of the Influence Model to obtain good or very good results on the less-populated classes—Orienteer and Seeker for the task area, and Attacker and Supporter for the socio-emotional area.},
  keywords = {Functional roles,influence model,multimodal analysis,non-linguistic behavior},
  file = {/Users/ryedida/Zotero/storage/D2Y5ENZI/Dong et al. - 2013 - Modeling functional roles dynamics in small group interactions(2).pdf}
}

@article{dongDeKeDVerDeepLearningbased2023,
  title = {{{DeKeDVer}}: {{A}} Deep Learning-Based Multi-Type Software Vulnerability Classification Framework Using Vulnerability Description and Source Code},
  shorttitle = {{{DeKeDVer}}},
  author = {Dong, Yukun and Tang, Yeer and Cheng, Xiaotong and Yang, Yufei},
  date = {2023-11-01},
  journaltitle = {Information and Software Technology},
  shortjournal = {Information and Software Technology},
  volume = {163},
  pages = {107290},
  issn = {0950-5849},
  doi = {10.1016/j.infsof.2023.107290},
  url = {https://www.sciencedirect.com/science/article/pii/S0950584923001441},
  urldate = {2023-10-06},
  abstract = {Context: Software vulnerabilities have confused software developers for a long time. Vulnerability classification is thus crucial, through which we can know the specific type of vulnerability and then conduct targeted repair. Stack of papers have looked into deep learning-based multi-type vulnerability classification, among which most are based on vulnerability descriptions and some are based on source code. While vulnerability descriptions can sometimes mislead vulnerability classification and source code-based approaches have been rarely explored in multi-type vulnerability classification. Objective: We design DeKeDVer (Vulnerability Descriptions and Key Domain based Vulnerability Classifier) with two objectives: (i) to extract more useful information from vulnerability descriptions; (ii) to better utilize the information source code can reflect. Method: In this work, we propose a multi-type vulnerability classifier which combine vulnerability descriptions and source code together. We process vulnerability descriptions and source code of each project separately. For the vulnerability description of a sample, we preprocess it using a specified way we design based on our observations on numerous descriptions and then select text features. After that, Text Recurrent Convolutional Neural Network (TextRCNN) is applied to learn text information. For source code, we leverage its Code Property Graph (CPG) and extract key domain from it which are then embedded. Acquired feature vectors are then fed into Relational Graph Attention Network (RGAT). Result vectors gained from TextRCNN and RGAT are combined together as the feature vector of the current sample. A Multi-Layer Perceptron (MLP) layer is further added to undertake classification. Results: We conduct our experiments on C/C++ projects from NVD. Experimental results show that our work achieves 84.49\% in weighted F1-measure which proves our work to be more effective. 
Conclusion: Our work utilizes information reflected both from vulnerability descriptions and source code to facilitate vulnerability classification and achieves higher weighted F1-measure than existing vulnerability classification tools.},
  keywords = {Multi-type vulnerability classification,Relational graph attention network,Source code,Text Recurrent Convolutional Neural Network,Vulnerability description},
  file = {/Users/ryedida/Zotero/storage/BGCBYSXL/Dong et al_2023_DeKeDVer.pdf;/Users/ryedida/Zotero/storage/7VZZK23A/S0950584923001441.html}
}

@incollection{Doshi-Velez2018,
  title = {Considerations for {{Evaluation}} and {{Generalization}} in {{Interpretable Machine Learning}}},
  booktitle = {Explainable and Interpretable Models in Computer Vision and Machine Learning},
  author = {Doshi-Velez, Finale and Kim, Been},
  date = {2018},
  pages = {3--17},
  doi = {10.1007/978-3-319-98131-4_1},
  abstract = {As machine learning systems become ubiquitous, there has been a surge of interest in interpretable machine learning: systems that provide explanation for their outputs. These explanations are often used to qualitatively assess other criteria such as safety or non-discrimination. However, despite the interest in interpretability, there is very little consensus on what interpretable machine learning is and how it should be measured. In this position paper, we first define interpretability and describe when interpretability is needed (and when it is not). Next, we suggest a taxonomy for rigorous evaluation and expose open questions towards a more rigorous science of interpretable machine learning.}
}

@article{douTurBOCostefficientConfigurationbased2023,
  title = {{{TurBO}}: {{A}} Cost-Efficient Configuration-Based Auto-Tuning Approach for Cluster-Based Big Data Frameworks},
  shorttitle = {{{TurBO}}},
  author = {Dou, Hui and Zhang, Lei and Zhang, Yiwen and Chen, Pengfei and Zheng, Zibin},
  date = {2023-07-01},
  journaltitle = {Journal of Parallel and Distributed Computing},
  shortjournal = {Journal of Parallel and Distributed Computing},
  volume = {177},
  pages = {89--105},
  issn = {0743-7315},
  doi = {10.1016/j.jpdc.2023.03.002},
  url = {https://www.sciencedirect.com/science/article/pii/S0743731523000382},
  urldate = {2024-01-15},
  abstract = {Big data processing frameworks such as Spark usually provide a large number of performance-related configuration parameters, how to auto-tune these parameters for a better performance has been a hot issue in academia as well as industry for years. Through delicately tradeoff between exploration and exploitation, Bayesian Optimization (BO) is currently the most appealing algorithm to achieve configuration auto-tuning. However, considering the tuning cost constraint in practice, there are three critical limitations preventing conventional BO-based approaches from being directly applied into auto-tuning cluster-based big data frameworks. In this paper, we propose a cost-efficient configuration auto-tuning approach named TurBO for big data frameworks based on two enhancements of vanilla BO:1) To reduce the essential iteration times, TurBO integrates a well-designed adaptive pseudo point mechanism with BO; 2) To avoid the time-consuming practical evaluation of sub-optimal configurations as possible, TurBO leverages the proposed CASampling method to intelligently tackle with these sub-optimal configurations based on ensemble learning with historical tuning experiences. To evaluate the performance of TurBO, we conducted a series of experiments on a local Spark cluster with 9 different HiBench benchmark applications. Overall, compared with 3 representative BO-based baseline approaches OpenTuner, Bliss and ResTune, TurBO is able to speedup the tuning procedures respectively by 2.24×, 2.29× and 1.97× on average. Besides, TurBO can always achieve a positive cumulative performance gain under the simulated dynamic workload scenario, which means TurBO is indeed appropriate for workload changes of big data applications.},
  keywords = {Bayesian optimization,Big data framework,Configuration parameter,Pseudo point,Tuning cost},
  file = {/Users/ryedida/Zotero/storage/ZE3J2Q63/Dou et al_2023_TurBO.pdf}
}

@unpublished{Dreossi2017,
  title = {Systematic {{Testing}} of {{Convolutional Neural Networks}} for {{Autonomous Driving}}},
  author = {Dreossi, Tommaso and Ghosh, Shromona and Sangiovanni-Vincentelli, Alberto and Seshia, Sanjit A.},
  date = {2017},
  eprint = {1708.03309},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1708.03309},
  abstract = {We present a framework to systematically analyze convolutional neural networks (CNNs) used in classification of cars in autonomous vehicles. Our analysis procedure comprises an image generator that produces synthetic pictures by sampling in a lower dimension image modification subspace and a suite of visualization tools. The image generator produces images which can be used to test the CNN and hence expose its vulnerabilities. The presented framework can be used to extract insights of the CNN classifier, compare across classification models, or generate training and validation datasets.},
  file = {/Users/ryedida/Zotero/storage/ZI7B95PZ/Dreossi et al. - 2017 - Systematic Testing of Convolutional Neural Networks for Autonomous Driving(2).pdf}
}

@unpublished{Du2018,
  title = {{{DeepCruiser}}: {{Automated Guided Testing}} for {{Stateful Deep Learning Systems}}},
  author = {Du, Xiaoning and Xie, Xiaofei and Li, Yi and Ma, Lei and Zhao, Jianjun and Liu, Yang},
  date = {2018},
  eprint = {1812.05339},
  eprinttype = {arxiv},
  pages = {1--23},
  url = {http://arxiv.org/abs/1812.05339},
  abstract = {Deep learning (DL) defines a data-driven programming paradigm that automatically composes the system decision logic from the training data. In company with the data explosion and hardware acceleration during the past decade, DL achieves tremendous success in many cutting-edge applications. However, even the state-of-the-art DL systems still suffer from quality and reliability issues. It was only until recently that some preliminary progress was made in testing feed-forward DL systems. In contrast to feed-forward DL systems, recurrent neural networks (RNN) follow a very different architectural design, implementing temporal behaviors and memory with loops and internal states. Such stateful nature of RNN contributes to its success in handling sequential inputs such as audio, natural languages and video processing, but also poses new challenges for quality assurance. In this paper, we initiate the very first step towards testing RNN-based stateful DL systems. We model RNN as an abstract state transition system, based on which we define a set of test coverage criteria specialized for stateful DL systems. Moreover, we propose an automated testing framework, DeepCruiser, which systematically generates tests in large scale to uncover defects of stateful DL systems with coverage guidance. Our in-depth evaluation on a state-of-the-art speech-to-text DL system demonstrates the effectiveness of our technique in improving quality and reliability of stateful DL systems.}
}

@article{dumanSynapticPlasticityDepression2016,
  title = {Synaptic Plasticity and Depression: New Insights from Stress and Rapid-Acting Antidepressants},
  shorttitle = {Synaptic Plasticity and Depression},
  author = {Duman, Ronald S. and Aghajanian, George K. and Sanacora, Gerard and Krystal, John H.},
  date = {2016-03},
  journaltitle = {Nature Medicine},
  shortjournal = {Nat Med},
  volume = {22},
  number = {3},
  pages = {238--249},
  publisher = {{Nature Publishing Group}},
  issn = {1546-170X},
  doi = {10.1038/nm.4050},
  url = {https://www.nature.com/articles/nm.4050},
  urldate = {2023-09-21},
  abstract = {Ron Duman and colleagues discuss recent insights into a role for circuit disruption in the mechanisms of stress-induced depression. Furthermore they discuss the potential for rapid-acting antidepressants to alleviate these defects.},
  langid = {english},
  keywords = {Depression},
  file = {/Users/ryedida/Zotero/storage/CAIDMJ6V/Duman et al_2016_Synaptic plasticity and depression.pdf}
}

@inproceedings{Dutta2018,
  title = {Output Range Analysis for Deep Feedforward Neural Networks},
  booktitle = {{{NASA Formal Methods}}},
  author = {Dutta, Souradeep and Jha, Susmit and Sankaranarayanan, Sriram and Tiwari, Ashish},
  date = {2018},
  series = {Lecture Notes in Computer Science},
  volume = {10811},
  eprint = {1709.09130},
  eprinttype = {arxiv},
  pages = {121--138},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-77935-5_9},
  abstract = {Given a neural network (NN) and a set of possible inputs to the network described by polyhedral constraints, we aim to compute a safe over-approximation of the set of possible output values. This operation is a fundamental primitive enabling the formal analysis of neural networks that are extensively used in a variety of machine learning tasks such as perception and control of autonomous systems. Increasingly, they are deployed in high-assurance applications, leading to a compelling use case for formal verification approaches. In this paper, we present an efficient range estimation algorithm that iterates between an expensive global combinatorial search using mixed-integer linear programming problems, and a relatively inexpensive local optimization that repeatedly seeks a local optimum of the function represented by the NN. We implement our approach and compare it with Reluplex, a recently proposed solver for deep neural networks. We demonstrate applications of our approach to computing flowpipes for neural network-based feedback controllers. We show that the use of local search in conjunction with mixed-integer linear programming solvers effectively reduces the combinatorial search over possible combinations of active neurons in the network by pruning away suboptimal nodes.},
  isbn = {9783319779348}
}

@inproceedings{Dvijotham2018,
  title = {A Dual Approach to Scalable Verification of Deep Networks},
  booktitle = {34th Conference on Uncertainty in Artificial Intelligence 2018, UAI 2018},
  author = {Dvijotham, Krishnamurthy and Stanforth, Robert and Gowal, Sven and Mann, Timothy and Kohli, Pushmeet},
  date = {2018},
  volume = {2},
  eprint = {1803.06567},
  eprinttype = {arxiv},
  pages = {550--559},
  abstract = {This paper addresses the problem of formally verifying desirable properties of neural networks, i.e., obtaining provable guarantees that neural networks satisfy specifications relating their inputs and outputs (robustness to bounded norm adversarial perturbations, for example). Most previous work on this topic was limited in its applicability by the size of the network, network architecture and the complexity of properties to be verified. In contrast, our framework applies to a general class of activation functions and specifications on neural network inputs and outputs. We formulate verification as an optimization problem (seeking to find the largest violation of the specification) and solve a Lagrangian relaxation of the optimization problem to obtain an upper bound on the worst case violation of the specification being verified. Our approach is anytime i.e. it can be stopped at any time and a valid bound on the maximum violation can be obtained. We develop specialized verification algorithms with provable tightness guarantees under special assumptions and demonstrate the practical significance of our general verification approach on a variety of verification tasks.},
  isbn = {9781510871601},
  file = {/Users/ryedida/Zotero/storage/7KKMQSA3/Dvijotham et al. - 2018 - A dual approach to scalable verification of deep networks(2).pdf}
}

@unpublished{Dvijotham2018a,
  title = {Training Verified Learners with Learned Verifiers},
  author = {Dvijotham, Krishnamurthy and Gowal, Sven and Stanforth, Robert and Arandjelovic, Relja and O'Donoghue, Brendan and Uesato, Jonathan and Kohli, Pushmeet},
  date = {2018},
  eprint = {1805.10265},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1805.10265},
  abstract = {This paper proposes a new algorithmic framework, predictor-verifier training, to train neural networks that are verifiable, i.e., networks that provably satisfy some desired input-output properties. The key idea is to simultaneously train two networks: a predictor network that performs the task at hand,e.g., predicting labels given inputs, and a verifier network that computes a bound on how well the predictor satisfies the properties being verified. Both networks can be trained simultaneously to optimize a weighted combination of the standard data-fitting loss and a term that bounds the maximum violation of the property. Experiments show that not only is the predictor-verifier architecture able to train networks to achieve state of the art verified robustness to adversarial examples with much shorter training times (outperforming previous algorithms on small datasets like MNIST and SVHN), but it can also be scaled to produce the first known (to the best of our knowledge) verifiably robust networks for CIFAR-10.},
  file = {/Users/ryedida/Zotero/storage/ZKXB74NG/Dvijotham et al. - 2018 - Training verified learners with learned verifiers(2).pdf}
}

@online{dziugaiteComputingNonvacuousGeneralization2017,
  author = {Dziugaite, Gintare Karolina and Roy, Daniel M.},
  title = {Computing {{Nonvacuous Generalization Bounds}} for {{Deep}} ({{Stochastic}}) {{Neural Networks}} with {{Many More Parameters}} than {{Training Data}}},
  date = {2017-10-18},
  eprint = {1703.11008},
  eprinttype = {arxiv},
  eprintclass = {cs},
  url = {http://arxiv.org/abs/1703.11008},
  urldate = {2023-11-26},
  langid = {english},
  pubstate = {preprint},
  keywords = {Computer Science - Machine Learning},
  file = {/Users/ryedida/Zotero/storage/UBXJ6SBI/Dziugaite and Roy - 2017 - Computing Nonvacuous Generalization Bounds for Dee.pdf},
  abstract = {One of the defining properties of deep learning is that models are chosen to have many more parameters than available training data. In light of this capacity for overfitting, it is remarkable that simple algorithms like SGD reliably return solutions with low test error. One roadblock to explaining these phenomena in terms of implicit regularization, structural properties of the solution, and/or easiness of the data is that many learning bounds are quantitatively vacuous when applied to networks learned by SGD in this “deep learning” regime. Logically, in order to explain generalization, we need nonvacuous bounds. We return to an idea by Langford and Caruana (2001), who used PAC-Bayes bounds to compute nonvacuous numerical bounds on generalization error for stochastic two-layer two-hidden-unit neural networks via a sensitivity analysis. By optimizing the PAC-Bayes bound directly, we are able to extend their approach and obtain nonvacuous generalization bounds for deep stochastic neural network classifiers with millions of parameters trained on only tens of thousands of examples. We connect our findings to recent and old work on flat minima and MDL-based explanations of generalization.}
}

@inproceedings{Ehlers2017,
  title = {Formal Verification of Piece-Wise Linear Feed-Forward Neural Networks},
  author = {Ehlers, Rüdiger},
  date = {2017},
  booktitle = {Automated Technology for Verification and Analysis},
  series = {Lecture Notes in Computer Science},
  volume = {10482},
  eprint = {1705.01320},
  eprinttype = {arxiv},
  pages = {269--286},
  publisher = {{Springer}},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-68167-2_19},
  abstract = {We present an approach for the verification of feed-forward neural networks in which all nodes have a piece-wise linear activation function. Such networks are often used in deep learning and have been shown to be hard to verify for modern satisfiability modulo theory (SMT) and integer linear programming (ILP) solvers. The starting point of our approach is the addition of a global linear approximation of the overall network behavior to the verification problem that helps with SMT-like reasoning over the network behavior. We present a specialized verification algorithm that employs this approximation in a search process in which it infers additional node phases for the non-linear nodes in the network from partial node phase assignments, similar to unit propagation in classical SAT solving. We also show how to infer additional conflict clauses and safe node fixtures from the results of the analysis steps performed during the search. The resulting approach is evaluated on collision avoidance and handwritten digit recognition case studies.},
  isbn = {9783319681665},
  file = {/Users/ryedida/Zotero/storage/6YQN6HZJ/Ehlers - 2017 - Formal verification of piece-wise linear feed-forward neural networks(2).pdf}
}

@unpublished{elsayedAdversarialExamplesThat2018,
  author = {Elsayed, Gamaleldin F. and Shankar, Shreya and Cheung, Brian and Papernot, Nicolas and Kurakin, Alex and Goodfellow, Ian and Sohl-Dickstein, Jascha},
  title = {Adversarial {{Examples}} That {{Fool}} Both {{Computer Vision}} and {{Time-Limited Humans}}},
  date = {2018-05-21},
  eprint = {1802.08195},
  eprinttype = {arxiv},
  eprintclass = {cs, q-bio, stat},
  url = {http://arxiv.org/abs/1802.08195},
  urldate = {2021-04-16},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Quantitative Biology - Neurons and Cognition,Statistics - Machine Learning},
  annotation = {129 citations (Semantic Scholar/arXiv) [2021-04-16]},
  file = {/Users/ryedida/Zotero/storage/3AVVVEXM/Elsayed et al. - 2018 - Adversarial Examples that Fool both Computer Visio.pdf},
  abstract = {Machine learning models are vulnerable to adversarial examples: small changes to images can cause computer vision models to make mistakes such as identifying a school bus as an ostrich. However, it is still an open question whether humans are prone to similar mistakes. Here, we address this question by leveraging recent techniques that transfer adversarial examples from computer vision models with known parameters and architecture to other models with unknown parameters and architecture, and by matching the initial processing of the human visual system. We find that adversarial examples that strongly transfer across computer vision models influence the classifications made by time-limited human observers.}
}

@unpublished{elsken2017simple,
  title = {Simple and Efficient Architecture Search for Convolutional Neural Networks},
  author = {Elsken, Thomas and Metzen, Jan Hendrik and Hutter, Frank},
  date = {2017},
  eprint = {1711.04528},
  eprinttype = {arxiv}
}

@article{elsken2019neural,
  title = {Neural Architecture Search: {{A}} Survey},
  author = {Elsken, Thomas and Metzen, Jan Hendrik and Hutter, Frank},
  date = {2019},
  journaltitle = {The Journal of Machine Learning Research},
  volume = {20},
  number = {1},
  pages = {1997--2017},
  publisher = {{JMLR.org}}
}

@article{elskenNeuralArchitectureSearch,
  title = {Neural {{Architecture Search}}: {{A Survey}}},
  author = {Elsken, Thomas and Metzen, Jan Hendrik and Hutter, Frank},
  date = {2019},
  journaltitle = {The Journal of Machine Learning Research},
  volume = {20},
  number = {1},
  pages = {1997--2017},
  abstract = {Deep Learning has enabled remarkable progress over the last years on a variety of tasks, such as image recognition, speech recognition, and machine translation. One crucial aspect for this progress are novel neural architectures. Currently employed architectures have mostly been developed manually by human experts, which is a time-consuming and error-prone process. Because of this, there is growing interest in automated neural architecture search methods. We provide an overview of existing work in this field of research and categorize them according to three dimensions: search space, search strategy, and performance estimation strategy.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/9PDZ3QKJ/Elsken et al. - Neural Architecture Search A Survey.pdf}
}

@inproceedings{eriksson2019scalable,
  title = {Scalable Global Optimization via Local Bayesian Optimization},
  author = {Eriksson, David and Pearce, Michael and Gardner, Jacob and Turner, Ryan D and Poloczek, Matthias},
  date = {2019},
  booktitle = {Advances in Neural Information Processing Systems},
  volume = {32}
}

@article{espadotoQuantitativeSurveyDimension2021,
  author = {Espadoto, Mateus and Martins, Rafael M. and Kerren, Andreas and Hirata, Nina S. T. and Telea, Alexandru C.},
  title = {Toward a {{Quantitative Survey}} of {{Dimension Reduction Techniques}}},
  date = {2021-03-01},
  journaltitle = {IEEE Transactions on Visualization and Computer Graphics},
  shortjournal = {IEEE Trans. Visual. Comput. Graphics},
  volume = {27},
  number = {3},
  pages = {2153--2173},
  issn = {1077-2626, 1941-0506, 2160-9306},
  doi = {10.1109/TVCG.2019.2944182},
  url = {https://ieeexplore.ieee.org/document/8851280/},
  urldate = {2023-11-14},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/4YLBCHR3/Espadoto et al. - 2021 - Toward a Quantitative Survey of Dimension Reductio.pdf},
  abstract = {Dimensionality reduction methods, also known as projections, are frequently used in multidimensional data exploration in machine learning, data science, and information visualization. Tens of such techniques have been proposed, aiming to address a wide set of requirements, such as ability to show the high-dimensional data structure, distance or neighborhood preservation, computational scalability, stability to data noise and/or outliers, and practical ease of use. However, it is far from clear for practitioners how to choose the best technique for a given use context. We present a survey of a wide body of projection techniques that helps answering this question. For this, we characterize the input data space, projection techniques, and the quality of projections, by several quantitative metrics. We sample these three spaces according to these metrics, aiming at good coverage with bounded effort. We describe our measurements and outline observed dependencies of the measured variables. Based on these results, we draw several conclusions that help comparing projection techniques, explain their results for different types of data, and ultimately help practitioners when choosing a projection for a given context. Our methodology, datasets, projection implementations, metrics, visualizations, and results are publicly open, so interested stakeholders can examine and/or extend this benchmark.}
}

@article{Faber2018,
  title = {How the Stimulus Influences Mind Wandering in Semantically-Rich Task Contexts},
  author = {Faber, Myrthe and D'Mello, Sidney K.},
  date = {2018},
  journaltitle = {Cognitive Research: Principles and Implications},
  volume = {3},
  number = {1},
  eid = {35},
  publisher = {{Springer}},
  issn = {2365-7464},
  doi = {10.1186/s41235-018-0129-0},
  keywords = {attention,comprehension,memory,mind wandering},
  file = {/Users/ryedida/Zotero/storage/5HFGFFRH/Faber, D'Mello - 2018 - How the stimulus influences mind wandering in semantically-rich task contexts(2).pdf}
}

@inproceedings{falkner2018bohb,
  author = {Falkner, Stefan and Klein, Aaron and Hutter, Frank},
  title = {{{BOHB}}: {{Robust}} and Efficient Hyperparameter Optimization at Scale},
  date = {2018},
  booktitle = {International Conference on Machine Learning},
  pages = {1437--1446},
  publisher = {{PMLR}},
  file = {/Users/ryedida/Zotero/storage/KM8C7WM2/Falkner et al_2018_BOHB.pdf}
}

@article{farabet2012learning,
  title = {Learning Hierarchical Features for Scene Labeling},
  author = {Farabet, Clement and Couprie, Camille and Najman, Laurent and LeCun, Yann},
  date = {2012},
  journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume = {35},
  number = {8},
  pages = {1915--1929},
  publisher = {{IEEE}}
}

@article{farcomeniReviewModernMultiple2008,
  title = {A Review of Modern Multiple Hypothesis Testing, with Particular Attention to the False Discovery Proportion},
  author = {Farcomeni, Alessio},
  date = {2008},
  journaltitle = {Statistical Methods in Medical Research},
  volume = {17},
  number = {4},
  pages = {347--388},
  publisher = {{SAGE Publications}},
  location = {London, England},
  issn = {0962-2802}
}

@unpublished{fawziAdversarialVulnerabilityAny2018,
  author = {Fawzi, Alhussein and Fawzi, Hamza and Fawzi, Omar},
  title = {Adversarial Vulnerability for Any Classifier},
  date = {2018-11-30},
  eprint = {1802.08686},
  eprinttype = {arxiv},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1802.08686},
  urldate = {2021-03-28},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Cryptography and Security,Computer Science - Machine Learning,Statistics - Machine Learning},
  file = {/Users/ryedida/Zotero/storage/3XT4I8LC/Fawzi et al. - 2018 - Adversarial vulnerability for any classifier.pdf},
  abstract = {Despite achieving impressive performance, state-of-the-art classifiers remain highly vulnerable to small, imperceptible, adversarial perturbations. This vulnerability has proven empirically to be very intricate to address. In this paper, we study the phenomenon of adversarial perturbations under the assumption that the data is generated with a smooth generative model. We derive fundamental upper bounds on the robustness to perturbations of any classification function, and prove the existence of adversarial perturbations that transfer well across different classifiers with small risk. Our analysis of the robustness also provides insights onto key properties of generative models, such as their smoothness and dimensionality of latent space. We conclude with numerical experimental results showing that our bounds provide informative baselines to the maximal achievable robustness on several datasets.}
}

@article{Fayyad1992,
  title = {On the {{Handling}} of {{Continuous-Valued Attributes}} in {{Decision Tree Generation}}},
  author = {Fayyad, Usama M. and Irani, Keki B.},
  date = {1992},
  journaltitle = {Machine Learning},
  volume = {8},
  number = {1},
  pages = {87--102},
  issn = {0885-6125, 1573-0565},
  doi = {10.1023/A:1022638503176},
  abstract = {We present a result applicable to classification learning algorithms that generate decision trees or rules using the information entropy minimization heuristic for discretizing continuous-valued attributes. The result serves to give a better understanding of the entropy measure, to point out that the behavior of the information entropy heuristic possesses desirable properties that justify its usage in a formal sense, and to improve the efficiency of evaluating continuous-valued attributes for cut value selection. Along with the formal proof, we present empirical results that demonstrate the theoretically expected reduction in evaluation effort for training data sets from real-world domains.},
  keywords = {classification,decision trees,discretization,empirical concept learning,Induction,information entropy minimization}
}

@article{fengActivityWeightDuality2023a,
  title = {Activity–Weight Duality in Feed-Forward Neural Networks Reveals Two Co-Determinants for Generalization},
  author = {Feng, Yu and Zhang, Wei and Tu, Yuhai},
  date = {2023},
  journaltitle = {Nature Machine Intelligence},
  volume = {5},
  number = {8},
  pages = {908--918},
  publisher = {{Nature Publishing Group UK}},
  location = {London},
  issn = {2522-5839},
  file = {/Users/ryedida/Zotero/storage/VAYF37LR/Feng et al_2023_Activity–weight duality in feed-forward neural networks reveals two.pdf}
}

@incollection{feurer2019hyperparameter,
  title = {Hyperparameter Optimization},
  booktitle = {Automated Machine Learning},
  author = {Feurer, Matthias and Hutter, Frank},
  date = {2019},
  pages = {3--33},
  publisher = {{Springer}},
  location = {Cham}
}

@incollection{feurerHyperparameterOptimization2019,
  title = {Hyperparameter Optimization},
  booktitle = {Automated Machine Learning: Methods, Systems, Challenges},
  author = {Feurer, Matthias and Hutter, Frank},
  date = {2019},
  pages = {3--33},
  publisher = {{Springer International Publishing}},
  isbn = {3030053172},
  file = {/Users/ryedida/Zotero/storage/8YB3CGPD/Feurer and Hutter - 2019 - Hyperparameter optimization.pdf}
}

@inproceedings{fialho2010comparison,
  title = {Comparison-Based Adaptive Strategy Selection with Bandits in Differential Evolution},
  booktitle = {Parallel Problem Solving from Nature, {{PPSN XI}}: 11th International Conference, Kraków, Poland, September 11--15, 2010, Proceedings, Part {{I}}},
  author = {Fialho, Alvaro and Ros, Raymond and Schoenauer, Marc and Sebag, Michele},
  date = {2010},
  pages = {194--203},
  publisher = {{Springer}}
}

@article{Fire2018,
  title = {Over-{{Optimization}} of {{Academic Publishing Metrics}}: {{Observing Goodhart}}'s {{Law}} in {{Action}}},
  author = {Fire, Michael and Guestrin, Carlos},
  date = {2019},
  journaltitle = {GigaScience},
  volume = {8},
  number = {6},
  eid = {giz053},
  eprint = {1809.07841},
  eprinttype = {arxiv},
  publisher = {{Oxford University Press}},
  doi = {10.1093/gigascience/giz053},
  url = {http://arxiv.org/abs/1809.07841},
  abstract = {The academic publishing world is changing significantly, with ever-growing numbers of publications each year and shifting publishing patterns. However, the metrics used to measure academic success, such as the number of publications, citation number, and impact factor, have not changed for decades. Moreover, recent studies indicate that these metrics have become targets and follow Goodhart's Law, according to which "when a measure becomes a target, it ceases to be a good measure." In this study, we analyzed over 120 million papers to examine how the academic publishing world has evolved over the last century. Our study shows that the validity of citation-based measures is being compromised and their usefulness is lessening. In particular, the number of publications has ceased to be a good metric as a result of longer author lists, shorter papers, and surging publication numbers. Citation-based metrics, such as citation number and h-index, are likewise affected by the flood of papers, self-citations, and lengthy reference lists. Measures such as a journal's impact factor have also ceased to be good metrics due to the soaring numbers of papers that are published in top journals, particularly from the same pool of authors. Moreover, by analyzing properties of over 2600 research fields, we observed that citation-based metrics are not beneficial for comparing researchers in different fields, or even in the same department. Academic publishing has changed considerably; now we need to reconsider how we measure success.},
  keywords = {academic publishing metrics,big data,data science,Goodhart's law,science of science,scientometrics}
}

@article{Fischetti2018,
  title = {Deep Neural Networks and Mixed Integer Linear Optimization},
  author = {Fischetti, Matteo and Jo, Jason},
  date = {2018},
  journaltitle = {Constraints},
  volume = {23},
  number = {3},
  pages = {296--309},
  publisher = {{Springer}},
  issn = {1572-9354},
  doi = {10.1007/s10601-018-9285-6},
  abstract = {Deep Neural Networks (DNNs) are very popular these days, and are the subject of a very intense investigation. A DNN is made up of layers of internal units (or neurons), each of which computes an affine combination of the output of the units in the previous layer, applies a nonlinear operator, and outputs the corresponding value (also known as activation). A commonly-used nonlinear operator is the so-called rectified linear unit (ReLU), whose output is just the maximum between its input value and zero. In this (and other similar cases like max pooling, where the max operation involves more than one input value), for fixed parameters one can model the DNN as a 0-1 Mixed Integer Linear Program (0-1 MILP) where the continuous variables correspond to the output values of each unit, and a binary variable is associated with each ReLU to model its yes/no nature. In this paper we discuss the peculiarity of this kind of 0-1 MILP models, and describe an effective bound-tightening technique intended to ease its solution. We also present possible applications of the 0-1 MILP model arising in feature visualization and in the construction of adversarial examples. Computational results are reported, aimed at investigating (on small DNNs) the computational performance of a state-of-the-art MILP solver when applied to a known test case, namely, hand-written digit recognition.},
  keywords = {Computational experiments,Deep learning,Deep neural networks,Mathematical optimization,Mixed-integer programming}
}

@online{Flamary,
  title = {Astronomical {{Image Reconstruction}} with {{Convolutional Neural Networks}}},
  author = {Flamary, Rémi},
  date = {2016},
  eprint = {1612.04526v2},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1612.04526v2},
  abstract = {State of the art methods in astronomical image reconstruction rely on the resolution of a regularized or constrained optimization problem. Solving this problem can be computationally intensive and usually leads to a quadratic or at least superlinear complexity w.r.t. the number of pixels in the image. We investigate in this work the use of convolutional neural networks for image reconstruction in astronomy. With neural networks, the computationally intensive task is the training step, but the prediction step has a fixed complexity per pixel, i.e. a linear complexity. Numerical experiments show that our approach is both computationally efficient and competitive with other state of the art methods in addition to being interpretable.},
  pubstate = {preprint},
  file = {/Users/ryedida/Zotero/storage/F9K33P95/Flamary - Unknown - Astronomical Image Reconstruction with Convolutional Neural Networks(2).pdf}
}

@unpublished{frankleLotteryTicketHypothesis2019,
  author = {Frankle, Jonathan and Carbin, Michael},
  title = {The {{Lottery Ticket Hypothesis}}: {{Finding Sparse}}, {{Trainable Neural Networks}}},
  shorttitle = {The {{Lottery Ticket Hypothesis}}},
  date = {2019-03-04},
  eprint = {1803.03635},
  eprinttype = {arxiv},
  eprintclass = {cs},
  url = {http://arxiv.org/abs/1803.03635},
  urldate = {2021-04-15},
  langid = {english},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing},
  annotation = {687 citations (Semantic Scholar/arXiv) [2021-04-15]},
  file = {/Users/ryedida/Zotero/storage/GYD63M75/Frankle and Carbin - 2019 - The Lottery Ticket Hypothesis Finding Sparse, Tra.pdf},
  abstract = {Neural network pruning techniques can reduce the parameter counts of trained networks by over 90\%, decreasing storage requirements and improving computational performance of inference without compromising accuracy. However, contemporary experience is that the sparse architectures produced by pruning are difficult to train from the start, which would similarly improve training performance.}
}

@online{frazierTutorialBayesianOptimization2018,
  author = {Frazier, Peter I.},
  title = {A {{Tutorial}} on {{Bayesian Optimization}}},
  date = {2018-07-08},
  eprint = {1807.02811},
  eprinttype = {arxiv},
  eprintclass = {cs, math, stat},
  url = {http://arxiv.org/abs/1807.02811},
  urldate = {2024-01-05},
  langid = {english},
  pubstate = {preprint},
  keywords = {Computer Science - Machine Learning,Mathematics - Optimization and Control,Statistics - Machine Learning},
  file = {/Users/ryedida/Zotero/storage/Q95TZY66/Frazier - 2018 - A Tutorial on Bayesian Optimization.pdf},
  abstract = {Bayesian optimization is an approach to optimizing objective functions that take a long time (minutes or hours) to evaluate. It is best-suited for optimization over continuous domains of less than 20 dimensions, and tolerates stochastic noise in function evaluations. It builds a surrogate for the objective and quantifies the uncertainty in that surrogate using a Bayesian machine learning technique, Gaussian process regression, and then uses an acquisition function defined from this surrogate to decide where to sample. In this tutorial, we describe how Bayesian optimization works, including Gaussian process regression and three common acquisition functions: expected improvement, entropy search, and knowledge gradient. We then discuss more advanced techniques, including running multiple function evaluations in parallel, multi-fidelity and multi-information source optimization, expensive-to-evaluate constraints, random environmental conditions, multi-task Bayesian optimization, and the inclusion of derivative information. We conclude with a discussion of Bayesian optimization software and future research directions in the field. Within our tutorial material we provide a generalization of expected improvement to noisy evaluations, beyond the noise-free setting where it is more commonly applied. This generalization is justified by a formal decision-theoretic argument, standing in contrast to previous ad hoc modifications.}
}

@article{Frosst2018,
  title = {Distilling a Neural Network into a Soft Decision Tree},
  author = {Frosst, Nicholas and Hinton, Geoffrey},
  date = {2018},
  journaltitle = {CEUR Workshop Proceedings},
  volume = {2071},
  eprint = {1711.09784v1},
  eprinttype = {arxiv},
  issn = {1613-0073},
  abstract = {Deep neural networks have proved to be a very effective way to perform classification tasks. They excel when the input data is high dimensional, the relationship between the input and the output is complicated, and the number of labeled training examples is large. But it is hard to explain why a learned network makes a particular classification decision on a particular test case. This is due to their reliance on distributed hierarchical representations. If we could take the knowledge acquired by the neural net and express the same knowledge in a model that relies on hierarchical decisions instead, explaining a particular decision would be much easier. We describe a way of using a trained neural net to create a type of soft decision tree that generalizes better than one learned directly from the training data.},
  file = {/Users/ryedida/Zotero/storage/WL98J4C6/Frosst, Hinton - 2018 - Distilling a neural network into a soft decision tree(2).pdf}
}

@article{Fu16ist,
  author = {Fu, Wei and Menzies, Tim and Shen, Xipeng},
  title = {Tuning for Software Analytics: {{Is}} It Really Necessary?},
  date = {2016},
  journaltitle = {Information and Software Technology},
  volume = {76},
  pages = {135--146},
  issn = {0950-5849},
  doi = {10.1016/j.infsof.2016.04.017},
  url = {https://www.sciencedirect.com/science/article/pii/S0950584916300738},
  keywords = {CART,Defect prediction,Differential evolution,Random forest,Search-based software engineering},
  abstract = {Context: Data miners have been widely used in software engineering to, say, generate defect predictors from static code measures. Such static code defect predictors perform well compared to manual methods, and they are easy to use and useful to use. But one of the “black arts” of data mining is setting the tunings that control the miner. Objective: We seek simple, automatic, and very effective method for finding those tunings. Method: For each experiment with different data sets (from open source JAVA systems), we ran differential evolution as an optimizer to explore the tuning space (as a first step) then tested the tunings using hold-out data. Results: Contrary to our prior expectations, we found these tunings were remarkably simple: it only required tens, not thousands, of attempts to obtain very good results. For example, when learning software defect predictors, this method can quickly find tunings that alter detection precision from 0\% to 60\%. Conclusion: Since (1) the improvements are so large, and (2) the tuning is so simple, we need to change standard methods in software analytics. At least for defect prediction, it is no longer enough to just run a data miner and present the result without conducting a tuning optimization study. The implication for other kinds of analytics is now an open and pressing issue.}
}

@unpublished{Furlanello2018,
  title = {Born {{Again Neural Networks}}},
  author = {Furlanello, Tommaso and Lipton, Zachary C. and Tschannen, Michael and Itti, Laurent and Anandkumar, Anima},
  date = {2018},
  eprint = {1805.04770},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1805.04770},
  abstract = {Knowledge distillation (KD) consists of transferring knowledge from one machine learning model (the teacher) to another (the student). Commonly, the teacher is a high-capacity model with formidable performance, while the student is more compact. By transferring knowledge, one hopes to benefit from the student's compactness. We study KD from a new perspective: rather than compressing models, we train students parameterized identically to their teachers. Surprisingly, these Born-Again Networks (BANs), outperform their teachers significantly, both on computer vision and language modeling tasks. Our experiments with BANs based on DenseNets demonstrate state-of-the-art performance on the CIFAR-10 (3.5\%) and CIFAR-100 (15.5\%) datasets, by validation error. Additional experiments explore two distillation objectives: (i) Confidence-Weighted by Teacher Max (CWTM) and (ii) Dark Knowledge with Permuted Predictions (DKPP). Both methods elucidate the essential components of KD, demonstrating a role of the teacher outputs on both predicted and non-predicted classes. We present experiments with students of various capacities, focusing on the under-explored case where students overpower teachers. Our experiments show significant advantages from transferring knowledge between DenseNets and ResNets in either direction.},
  file = {/Users/ryedida/Zotero/storage/HIK5QA5Q/Furlanello et al. - 2018 - Born Again Neural Networks(2).pdf}
}

@unpublished{gaierWeightAgnosticNeural2019,
  author = {Gaier, Adam and Ha, David},
  title = {Weight {{Agnostic Neural Networks}}},
  date = {2019-09-05},
  eprint = {1906.04358},
  eprinttype = {arxiv},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1906.04358},
  urldate = {2021-04-13},
  langid = {english},
  keywords = {Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
  annotation = {101 citations (Semantic Scholar/arXiv) [2021-04-13]},
  file = {/Users/ryedida/Zotero/storage/H7NYTUH7/Gaier and Ha - 2019 - Weight Agnostic Neural Networks.pdf},
  abstract = {Not all neural network architectures are created equal, some perform much better than others for certain tasks. But how important are the weight parameters of a neural network compared to its architecture? In this work, we question to what extent neural network architectures alone, without learning any weight parameters, can encode solutions for a given task. We propose a search method for neural network architectures that can already perform a task without any explicit weight training. To evaluate these networks, we populate the connections with a single shared weight parameter sampled from a uniform random distribution, and measure the expected performance. We demonstrate that our method can find minimal neural network architectures that can perform several reinforcement learning tasks without weight training. On a supervised learning domain, we find network architectures that achieve much higher than chance accuracy on MNIST using random weights. Interactive version of this paper at https://weightagnostic.github.io/}
}

@unpublished{galkeForgetMeNot2021,
  title = {Forget Me Not: {{A Gentle Reminder}} to {{Mind}} the {{Simple Multi-Layer Perceptron Baseline}} for {{Text Classification}}},
  shorttitle = {Forget Me Not},
  author = {Galke, Lukas and Scherp, Ansgar},
  date = {2021-09-23},
  eprint = {2109.03777},
  eprinttype = {arxiv},
  eprintclass = {cs},
  url = {http://arxiv.org/abs/2109.03777},
  urldate = {2021-09-29},
  abstract = {Graph neural networks have triggered a resurgence of graph-based text classification. We show that already a simple MLP baseline achieves comparable performance on benchmark datasets, questioning the importance of synthetic graph structures. When considering an inductive scenario, i.e., when adding new documents to a corpus, a simple MLP even outperforms the recent graph-based models TextGCN and HeteGCN and is comparable with HyperGAT. We further fine-tune DistilBERT and find that it outperforms all state-of-the-art models. We suggest that future studies use at least an MLP baseline to contextualize the results. We provide recommendations for the design and training of such a baseline.},
  langid = {english},
  keywords = {Computer Science - Computation and Language,Computer Science - Information Retrieval,Computer Science - Machine Learning,I.2.7,simple},
  annotation = {0 citations (Semantic Scholar/arXiv) [2021-09-28]},
  file = {/Users/ryedida/Zotero/storage/XEUDC5HK/Galke and Scherp - 2021 - Forget me not A Gentle Reminder to Mind the Simpl.pdf}
}

@inproceedings{gaoAutomatingRemovalObsolete2021,
  title = {Automating the {{Removal}} of {{Obsolete TODO Comments}}},
  booktitle = {Proceedings of the 29th {{ACM Joint Meeting}} on {{European Software Engineering Conference}} and {{Symposium}} on the {{Foundations}} of {{Software Engineering}}},
  author = {Gao, Zhipeng and Xia, Xin and Lo, David and Grundy, John and Zimmermann, Thomas},
  date = {2021-08-20},
  eprint = {2108.05846},
  eprinttype = {arxiv},
  pages = {218--229},
  doi = {10.1145/3468264.3468553},
  url = {http://arxiv.org/abs/2108.05846},
  urldate = {2022-01-01},
  abstract = {TODO comments are very widely used by software developers to describe their pending tasks during software development. However, after performing the task developers sometimes neglect or simply forget to remove the TODO comment, resulting in obsolete TODO comments. These obsolete TODO comments can confuse development teams and may cause the introduction of bugs in the future, decreasing the software's quality and maintainability. In this work, we propose a novel model, named TDCleaner (TODO comment Cleaner), to identify obsolete TODO comments in software projects. TDCleaner can assist developers in just-in-time checking of TODO comments status and avoid leaving obsolete TODO comments. Our approach has two main stages: offline learning and online prediction. During offline learning, we first automatically establish {$<$}code\_change, todo\_comment, commit\_msg{$>$} training samples and leverage three neural encoders to capture the semantic features of TODO comment, code change and commit message respectively. TDCleaner then automatically learns the correlations and interactions between different encoders to estimate the final status of the TODO comment. For online prediction, we check a TODO comment's status by leveraging the offline trained model to judge the TODO comment's likelihood of being obsolete. We built our dataset by collecting TODO comments from the top-10,000 Python and Java Github repositories and evaluated TDCleaner on them. Extensive experimental results show the promising performance of our model over a set of benchmarks. We also performed an in-the-wild evaluation with real-world software projects, we reported 18 obsolete TODO comments identified by TDCleaner to Github developers and 9 of them have already been confirmed and removed by the developers, demonstrating the practical usage of our approach.},
  langid = {english},
  keywords = {Computer Science - Software Engineering},
  annotation = {0 citations (Semantic Scholar/arXiv) [2022-01-01] 0 citations (Semantic Scholar/DOI) [2022-01-01]},
  file = {/Users/ryedida/Zotero/storage/NZFMQ7P4/Gao et al. - 2021 - Automating the Removal of Obsolete TODO Comments.pdf}
}

@article{gaoEnCoSumEnhancedSemantic2023,
  author       = {Gao, Yuexiu and Zhang, Hongyu and Lyu, Chen},
  title        = {{{EnCoSum}}: Enhanced Semantic Features for Multi-Scale Multi-Modal Source Code Summarization},
  shorttitle   = {{{EnCoSum}}},
  journaltitle = {Empirical Software Engineering},
  shortjournal = {Empir Software Eng},
  date         = {2023-09-19},
  volume       = {28},
  number       = {5},
  pages        = {126},
  issn         = {1573-7616},
  doi          = {10.1007/s10664-023-10384-x},
  url          = {https://doi.org/10.1007/s10664-023-10384-x},
  urldate      = {2023-10-10},
  langid       = {english},
  keywords     = {Abstract syntax trees,Code summarization,Cross-modal fusion,Deep learning,Method name sequences},
  abstract     = {Code summarization aims to generate concise natural language descriptions for a piece of code, which can help developers comprehend the source code. Analysis of current work shows that the extraction of syntactic and semantic features of source code is crucial for generating high-quality summaries. To provide a more comprehensive feature representation of source code from different perspectives, we propose an approach named EnCoSum, which enhances semantic features for the multi-scale multi-modal code summarization method. This method complements our previously proposed M2TS approach (multi-scale multi-modal approach based on Transformer for source code summarization), which uses the multi-scale method to capture Abstract Syntax Trees (ASTs) structural information more completely and accurately at multiple local and global levels. In addition, we devise a new cross-modal fusion method to fuse source code and AST features, which can highlight key features in each modality that help generate summaries. To obtain richer semantic information, we improve M2TS. First, we add data flow and control flow to ASTs, and added-edge ASTs, called Enhanced-ASTs (E-ASTs). In addition, we introduce method name sequences extracted in the source code, which exist more knowledge about critical tokens in the corresponding summaries and can help the model generate higher-quality summaries. We conduct extensive experiments on processed Java and Python datasets and evaluate our approach via the four most commonly used machine translation metrics. The experimental results demonstrate that EnCoSum is effective and outperforms current state-of-the-art methods. Further, we perform ablation experiments on each of the model’s key components, and the results show that they all contribute to the performance of EnCoSum.},
  file         = {/Users/ryedida/Zotero/storage/BG47AVDV/Gao et al_2023_EnCoSum.pdf}
}

@online{gaoPALProgramaidedLanguage,
  title = {{{PAL}}: {{Program-aided Language Models}}},
  author = {Gao, Luyu and Madaan, Aman and Zhou, Shuyan and Alon, Uri and Liu, Pengfei and Yang, Yiming and Callan, Jamie and Neubig, Graham},
  date = {2022},
  eprint = {2211.10435},
  eprinttype = {arxiv},
  url = {https://arxiv.org/abs/2211.10435},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/FFR3RVW9/Gao et al. - PAL Program-aided Language Models.pdf}
}

@inproceedings{garipovLossSurfacesMode,
  title = {Loss {{Surfaces}}, {{Mode Connectivity}}, and {{Fast Ensembling}} of {{DNNs}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Garipov, Timur and Izmailov, Pavel and Podoprikhin, Dmitrii and Vetrov, Dmitry P. and Wilson, Andrew G.},
  date = {2018},
  volume = {31},
  eprint = {1802.10026},
  eprinttype = {arxiv},
  pagetotal = {10},
  abstract = {The loss functions of deep neural networks are complex and their geometric properties are not well understood. We show that the optima of these complex loss functions are in fact connected by simple curves over which training and test accuracy are nearly constant. We introduce a training procedure to discover these high-accuracy pathways between modes. Inspired by this new geometric insight, we also propose a new ensembling method entitled Fast Geometric Ensembling (FGE). Using FGE we can train high-performing ensembles in the time required to train a single model. We achieve improved performance compared to the recent state-of-the-art Snapshot Ensembles, on CIFAR-10, CIFAR-100, and ImageNet.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/TQ64ZDPT/Garipov et al. - Loss Surfaces, Mode Connectivity, and Fast Ensembl.pdf}
}

@article{gebruDatasheetsDatasets2021,
  title = {Datasheets for Datasets},
  author = {Gebru, Timnit and Morgenstern, Jamie and Vecchione, Briana and Vaughan, Jennifer Wortman and Wallach, Hanna and Daumé, III, Hal and Crawford, Kate},
  date = {2021-12},
  journaltitle = {Communications of the ACM},
  shortjournal = {Commun. ACM},
  volume = {64},
  number = {12},
  pages = {86--92},
  issn = {0001-0782, 1557-7317},
  doi = {10.1145/3458723},
  url = {https://dl.acm.org/doi/10.1145/3458723},
  urldate = {2021-12-05},
  abstract = {The machine learning community has no standardized way to document how and why a dataset was created, what information it contains, what tasks it should and should not be used for, and whether it might raise any ethical or legal concerns. To address this gap, we propose the concept of datasheets for datasets. In the electronics industry, it is standard to accompany every component with a datasheet providing standard operating characteristics, test results, recommended usage, and other information. Similarly, we recommend that every dataset be accompanied with a datasheet documenting its creation, composition, intended uses, maintenance, and other properties. Datasheets for datasets will facilitate better communication between dataset creators and users, and encourage the machine learning community to prioritize transparency and accountability.},
  langid = {english},
  annotation = {427 citations (Semantic Scholar/DOI) [2021-12-04]},
  file = {/Users/ryedida/Zotero/storage/D4VTG5T7/Gebru et al. - 2021 - Datasheets for datasets.pdf}
}

@inproceedings{Gehr2018,
  title = {{{AI2}}: {{Safety}} and {{Robustness Certification}} of {{Neural Networks}} with {{Abstract Interpretation}}},
  booktitle = {2018 {{IEEE Symposium}} on {{Security}} and {{Privacy}} ({{SP}})},
  author = {Gehr, Timon and Mirman, Matthew and Drachsler-Cohen, Dana and Tsankov, Petar and Chaudhuri, Swarat and Vechev, Martin},
  date = {2018},
  pages = {3--18},
  publisher = {{IEEE}},
  issn = {1081-6011},
  doi = {10.1109/SP.2018.00058},
  abstract = {We present AI2, the first sound and scalable analyzer for deep neural networks. Based on overapproximation, AI2 can automatically prove safety properties (e.g., robustness) of realistic neural networks (e.g., convolutional neural networks). The key insight behind AI2 is to phrase reasoning about safety and robustness of neural networks in terms of classic abstract interpretation, enabling us to leverage decades of advances in that area. Concretely, we introduce abstract transformers that capture the behavior of fully connected and convolutional neural network layers with rectified linear unit activations (ReLU), as well as max pooling layers. This allows us to handle real-world neural networks, which are often built out of those types of layers. We present a complete implementation of AI2 together with an extensive evaluation on 20 neural networks. Our results demonstrate that: (i) AI2 is precise enough to prove useful specifications (e.g., robustness), (ii) AI2 can be used to certify the effectiveness of state-of-the-art defenses for neural networks, (iii) AI2 is significantly faster than existing analyzers based on symbolic analysis, which often take hours to verify simple fully connected networks, and (iv) AI2 can handle deep convolutional networks, which are beyond the reach of existing methods.},
  isbn = {9781538643525},
  keywords = {Abstract Interpretation,Neural Networks,Reliable Machine Learning,Robustness},
  file = {/Users/ryedida/Zotero/storage/HEHPFS4A/Gehr et al. - 2018 - AI2 Safety and Robustness Certification of Neural Networks with Abstract Interpretation(2).pdf}
}

@article{gharibiAutomatedEndtoendManagement2021,
  title = {Automated End-to-End Management of the Modeling Lifecycle in Deep Learning},
  author = {Gharibi, Gharib and Walunj, Vijay and Nekadi, Raju and Marri, Raj and Lee, Yugyung},
  date = {2021-03-01},
  journaltitle = {Empirical Software Engineering},
  volume = {26},
  number = {2},
  publisher = {{Springer}},
  issn = {1573-7616},
  doi = {10.1007/s10664-020-09894-9},
  abstract = {Deep learning has improved the state-of-the-art results in an ever-growing number of domains. This success heavily relies on the development and training of deep learning models–an experimental, iterative process that produces tens to hundreds of models before arriving at a satisfactory result. While there has been a surge in the number of tools and frameworks that aim at facilitating deep learning, the process of managing the models and their artifacts is still surprisingly challenging and time-consuming. Existing model-management solutions are either tailored for commercial platforms or require significant code changes. Moreover, most of the existing solutions address a single phase of the modeling lifecycle, such as experiment monitoring, while ignoring other essential tasks, such as model deployment. In this paper, we present a software system to facilitate and accelerate the deep learning lifecycle, named ModelKB. ModelKB can automatically manage the modeling lifecycle end-to-end, including (1) monitoring and tracking experiments; (2) visualizing, searching for, and comparing models and experiments; (3) deploying models locally and on the cloud; and (4) sharing and publishing trained models. Moreover, our system provides a stepping-stone for enhanced reproducibility. ModelKB currently supports TensorFlow 2.0, Keras, and PyTorch, and it can be extended to other deep learning frameworks easily.},
  keywords = {Data management,Deep learning,Software automation}
}

@inproceedings{ghotra2015revisiting,
  author    = {Ghotra, Baljinder and McIntosh, Shane and Hassan, Ahmed E},
  title     = {Revisiting the Impact of Classification Techniques on the Performance of Defect Prediction Models},
  booktitle = {2015 {{IEEE}}/{{ACM}} 37th {{IEEE International Conference}} on {{Software Engineering}}},
  publisher = {{IEEE}},
  date      = {2015},
  volume    = {1},
  pages     = {789--800}
}

@inproceedings{Gilmer2018,
  title = {Adversarial Spheres},
  booktitle = {6th {{International Conference}} on {{Learning Representations}}, {{ICLR}} 2018 - {{Workshop Track Proceedings}}},
  author = {Gilmer, Justin and Metz, Luke and Faghri, Fartash and Schoenholz, Samuel S. and Raghu, Maithra and Wattenberg, Martin and Goodfellow, Ian},
  date = {2018},
  eprint = {1801.02774},
  eprinttype = {arxiv},
  abstract = {State of the art computer vision models have been shown to be vulnerable to small adversarial perturbations of the input. In other words, most images in the data distribution are both correctly classified by the model and are very close to a visually similar misclassified image. Despite substantial research interest, the cause of the phenomenon is still poorly understood and remains unsolved. We hypothesize that this counter intuitive behavior is a naturally occurring result of the high dimensional geometry of the data manifold. As a first step towards exploring this hypothesis, we study a simple synthetic dataset of classifying between two concentric high dimensional spheres. For this dataset we show a fundamental tradeoff between the amount of test error and the average distance to nearest error. In particular, we prove that any model which misclassifies a small constant fraction of a sphere will be vulnerable to adversarial perturbations of size O(1/d). Surprisingly, when we train several different architectures on this dataset, all of their error sets naturally approach this theoretical bound. As a result of the theory, the vulnerability of machine learning models to small adversarial perturbations is a logical consequence of the amount of test error observed. We hope that our theoretical analysis of this very simple case will point the way forward to explore how the geometry of complex real-world data sets leads to adversarial examples.},
  file = {/Users/ryedida/Zotero/storage/VNBLISAX/Gilmer et al. - 2018 - Adversarial spheres(2).pdf}
}

@article{girayUseDeepLearning2023,
  title = {On the Use of Deep Learning in Software Defect Prediction},
  author = {Giray, Görkem and Bennin, Kwabena Ebo and Köksal, Ömer and Babur, Önder and Tekinerdogan, Bedir},
  date = {2023-01},
  journaltitle = {Journal of Systems and Software},
  shortjournal = {Journal of Systems and Software},
  volume = {195},
  pages = {111537},
  issn = {0164-1212},
  doi = {10.1016/j.jss.2022.111537},
  url = {https://linkinghub.elsevier.com/retrieve/pii/S0164121222002138},
  urldate = {2023-10-06},
  abstract = {Objective: The purpose of this study is to systematically identify, analyze, summarize, and synthesize the current state of the utilization of DL algorithms for SDP in the literature. Method: We systematically selected a pool of 102 peer-reviewed studies and then conducted a quantitative and qualitative analysis using the data extracted from these studies. Results: Main highlights include: (1) most studies applied supervised DL; (2) two third of the studies used metrics as an input to DL algorithms; (3) Convolutional Neural Network is the most frequently used DL algorithm. Conclusion: Based on our findings, we propose to (1) develop more comprehensive DL approaches that automatically capture the needed features; (2) use diverse software artifacts other than source code; (3) adopt data augmentation techniques to tackle the class imbalance problem; (4) publish replication packages.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/5Q8XY3KX/Giray et al. - 2023 - On the use of deep learning in software defect pre.pdf}
}

@unpublished{Goodfellow,
  title = {Generative {{Adversarial Nets}}},
  author = {Goodfellow, Ian J. and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
  date = {2014},
  eprint = {1406.2661v1},
  eprinttype = {arxiv},
  pages = {1--9},
  file = {/Users/ryedida/Zotero/storage/ZZL3FN6S/Goodfellow et al. - Unknown - Generative Adversarial Nets(2).pdf}
}

@inproceedings{Goodfellow2015,
  title = {Explaining and Harnessing Adversarial Examples},
  booktitle = {3rd {{International Conference}} on {{Learning Representations}}, {{ICLR}} 2015 - {{Conference Track Proceedings}}},
  author = {Goodfellow, Ian J. and Shlens, Jonathon and Szegedy, Christian},
  date = {2015},
  eprint = {1412.6572},
  eprinttype = {arxiv},
  pages = {1--11},
  abstract = {Several machine learning models, including neural networks, consistently misclassify adversarial examples—inputs formed by applying small but intentionally worst-case perturbations to examples from the dataset, such that the perturbed input results in the model outputting an incorrect answer with high confidence. Early attempts at explaining this phenomenon focused on nonlinearity and overfitting. We argue instead that the primary cause of neural networks’ vulnerability to adversarial perturbation is their linear nature. This explanation is supported by new quantitative results while giving the first explanation of the most intriguing fact about them: their generalization across architectures and training sets. Moreover, this view yields a simple and fast method of generating adversarial examples. Using this approach to provide examples for adversarial training, we reduce the test set error of a maxout network on the MNIST dataset.},
  file = {/Users/ryedida/Zotero/storage/7W9BAC3X/Goodfellow, Shlens, Szegedy - 2015 - Explaining and harnessing adversarial examples(2).pdf}
}

@inproceedings{gopal2022peer,
  author    = {Gopal, Bhuvaneswari and Cooper, Stephen},
  title     = {Peer Instruction in Online Software Testing and Continuous Integration: {{A}} Replication Study},
  booktitle = {Proceedings of the {{ACM}}/{{IEEE}} 44th International Conference on Software Engineering: {{Software}} Engineering Education and Training},
  date      = {2022},
  pages     = {199--204}
}

@unpublished{Gopinath2017,
  author     = {Gopinath, Divya and Katz, Guy and Pasareanu, Corina S. and Barrett, Clark},
  title      = {{{DeepSafe}}: {{A Data-driven Approach}} for {{Checking Adversarial Robustness}} in {{Neural Networks}}},
  date       = {2017},
  eprinttype = {arxiv},
  eprint     = {1710.00486},
  url        = {http://arxiv.org/abs/1710.00486},
  abstract   = {Deep neural networks have become widely used, obtaining remarkable results in domains such as computer vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, and bio-informatics, where they have produced results comparable to human experts. However, these networks can be easily fooled by adversarial perturbations: minimal changes to correctly-classified inputs, that cause the network to mis-classify them. This phenomenon represents a concern for both safety and security, but it is currently unclear how to measure a network's robustness against such perturbations. Existing techniques are limited to checking robustness around a few individual input points, providing only very limited guarantees. We propose a novel approach for automatically identifying safe regions of the input space, within which the network is robust against adversarial perturbations. The approach is data-guided, relying on clustering to identify well-defined geometric regions as candidate safe regions. We then utilize verification techniques to confirm that these regions are safe or to provide counter-examples showing that they are not safe. We also introduce the notion of targeted robustness which, for a given target label and region, ensures that a NN does not map any input in the region to the target label. We evaluated our technique on the MNIST dataset and on a neural network implementation of a controller for the next-generation Airborne Collision Avoidance System for unmanned aircraft (ACAS Xu). For these networks, our approach identified multiple regions which were completely safe as well as some which were only safe for specific labels. It also discovered several adversarial perturbations of interest.},
  file       = {/Users/ryedida/Zotero/storage/XNPKGNMR/Gopinath et al. - 2017 - DeepSafe A Data-driven Approach for Checking Adversarial Robustness in Neural Networks(2).pdf}
}

@article{goukRegularisationNeuralNetworks2021,
  title = {Regularisation of Neural Networks by Enforcing {{Lipschitz}} Continuity},
  author = {Gouk, Henry and Frank, Eibe and Pfahringer, Bernhard and Cree, Michael J.},
  date = {2021},
  journaltitle = {Machine Learning},
  volume = {110},
  pages = {393--416},
  doi = {10.1007/s10994-020-05929-w},
  url = {https://doi.org/10.1007/s10994-020-05929-w},
  abstract = {We investigate the effect of explicitly enforcing the Lipschitz continuity of neural networks with respect to their inputs. To this end, we provide a simple technique for computing an upper bound to the Lipschitz constant—for multiple p-norms—of a feed forward neural network composed of commonly used layer types. Our technique is then used to formulate training a neural network with a bounded Lipschitz constant as a constrained optimisation problem that can be solved using projected stochastic gradient methods. Our evaluation study shows that the performance of the resulting models exceeds that of models trained with other common regularisers. We also provide evidence that the hyperparameters are intuitive to tune, demonstrate how the choice of norm for computing the Lipschitz constant impacts the resulting model, and show that the performance gains provided by our method are particularly noticeable when only a small amount of training data is available.},
  keywords = {Lipschitz continuity,Neural networks,Regularisation}
}

@online{GroundTruthAdversarial,
  title = {Ground {{Truth Adversarial Examples}}},
  author = {Carlini, Nicholas and Katz, Guy and Barrett, Clark and Dill, David L.},
  date = {2017},
  eprint = {1709.10207},
  eprinttype = {arxiv},
  url = {https://arxiv.org/abs/1709.10207},
  file = {/Users/ryedida/Zotero/storage/658A9JDD/Unknown - Unknown - Ground Truth Adversarial Examples(2).pdf}
}

@book{grunwaldMinimumDescriptionLength2007,
  title = {The Minimum Description Length Principle},
  author = {Grünwald, Peter D.},
  date = {2007},
  publisher = {{MIT Press}},
  isbn = {0-262-07281-5}
}

@unpublished{Guo2016,
  author     = {Guo, Cheng and Berkhahn, Felix},
  title      = {Entity {{Embeddings}} of {{Categorical Variables}}},
  date       = {2016},
  number     = {1},
  pages      = {1--9},
  eprinttype = {arxiv},
  eprint     = {1604.06737},
  url        = {http://arxiv.org/abs/1604.06737},
  abstract   = {We map categorical variables in a function approximation problem into Euclidean spaces, which are the entity embeddings of the categorical variables. The mapping is learned by a neural network during the standard supervised training process. Entity embedding not only reduces memory usage and speeds up neural networks compared with one-hot encoding, but more importantly by mapping similar values close to each other in the embedding space it reveals the intrinsic properties of the categorical variables. We applied it successfully in a recent Kaggle competition and were able to reach the third position with relative simple features. We further demonstrate in this paper that entity embedding helps the neural network to generalize better when the data is sparse and statistics is unknown. Thus it is especially useful for datasets with lots of high cardinality features, where other methods tend to overfit. We also demonstrate that the embeddings obtained from the trained neural network boost the performance of all tested machine learning methods considerably when used as the input features instead. As entity embedding defines a distance measure for categorical variables it can be used for visualizing categorical data and for data clustering.}
}

@inproceedings{Guo2018,
  title = {Countering Adversarial Images Using Input Transformations},
  booktitle = {6th {{International Conference}} on {{Learning Representations}}, {{ICLR}} 2018 - {{Conference Track Proceedings}}},
  author = {Guo, Chuan and Rana, Mayank and Cissé, Moustapha and van der Maaten, Laurens},
  date = {2018},
  eprint = {1711.00117},
  eprinttype = {arxiv},
  pages = {1--12},
  abstract = {This paper investigates strategies that defend against adversarial-example attacks on image-classification systems by transforming the inputs before feeding them to the system. Specifically, we study applying image transformations such as bit-depth reduction, JPEG compression, total variance minimization, and image quilting before feeding the image to a convolutional network classifier. Our experiments on ImageNet show that total variance minimization and image quilting are very effective defenses in practice, in particular, when the network is trained on transformed images. The strength of those defenses lies in their non-differentiable nature and their inherent randomness, which makes it difficult for an adversary to circumvent the defenses. Our best defense eliminates 60\% of strong gray-box and 90\% of strong black-box attacks by a variety of major attack methods.},
  file = {/Users/ryedida/Zotero/storage/HWT6MRT8/Guo et al. - 2018 - Countering adversarial images using input transformations(2).pdf}
}

@online{guoUniXcoderUnifiedCrossModal2022,
  author      = {Guo, Daya and Lu, Shuai and Duan, Nan and Wang, Yanlin and Zhou, Ming and Yin, Jian},
  title       = {{{UniXcoder}}: {{Unified Cross-Modal Pre-training}} for {{Code Representation}}},
  shorttitle  = {{{UniXcoder}}},
  date        = {2022-03-07},
  pubstate    = {preprint},
  eprinttype  = {arxiv},
  eprint      = {2203.03850},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2203.03850},
  urldate     = {2023-10-06},
  langid      = {english},
  keywords    = {Computer Science - Computation and Language,Computer Science - Programming Languages,Computer Science - Software Engineering},
  abstract    = {Pre-trained models for programming languages have recently demonstrated great success on code intelligence. To support both code-related understanding and generation tasks, recent works attempt to pre-train unified encoder-decoder models. However, such encoder-decoder framework is sub-optimal for auto-regressive tasks, especially code completion that requires a decoder-only manner for efficient inference. In this paper, we present UniXcoder, a unified cross-modal pre-trained model for programming language. The model utilizes mask attention matrices with prefix adapters to control the behavior of the model and leverages cross-modal contents like AST and code comment to enhance code representation. To encode AST that is represented as a tree in parallel, we propose a one-to-one mapping method to transform AST in a sequence structure that retains all structural information from the tree. Furthermore, we propose to utilize multi-modal contents to learn representation of code fragment with contrastive learning, and then align representations among programming languages using a cross-modal generation task. We evaluate UniXcoder on five code-related tasks over nine datasets. To further evaluate the performance of code fragment representation, we also construct a dataset for a new task, called zero-shot code-to-code search. Results show that our model achieves state-of-the-art performance on most tasks and analysis reveals that comment and AST can both enhance UniXcoder.},
  file        = {/Users/ryedida/Zotero/storage/NHD5JXK7/Guo et al. - 2022 - UniXcoder Unified Cross-Modal Pre-training for Co.pdf}
}

@article{hameedFarmConsumerFactors2018,
  author       = {Hameed, Ahsan and Hussain, Syed Ammar and Ijaz, Muhammad Umair and Ullah, Samee and Pasha, Imran and Suleria, Hafiz Ansar Rasul},
  title        = {Farm to {{Consumer}}: {{Factors Affecting}} the {{Organoleptic Characteristics}} of {{Coffee}}. {{II}}: {{Postharvest Processing Factors}}},
  shorttitle   = {Farm to {{Consumer}}},
  journaltitle = {Comprehensive Reviews in Food Science and Food Safety},
  date         = {2018},
  volume       = {17},
  number       = {5},
  pages        = {1184--1237},
  issn         = {1541-4337},
  doi          = {10.1111/1541-4337.12365},
  url          = {https://onlinelibrary.wiley.com/doi/abs/10.1111/1541-4337.12365},
  urldate      = {2023-09-20},
  langid       = {english},
  keywords     = {coffee processing,cup quality,organoleptic characteristics,postharvest processing,sensory attributes},
  abstract     = {The production and consumption of coffee are increasing despite the roadblocks to its agriculture and global trade. The unique, refreshing, and stimulating final cupping quality of coffee is the only reason for this rising production and consumption. Coffee quality is a multifaceted trait and is inevitably influenced by the way it is successively processed after harvesting. Reportedly, 60\% of the quality attributes of coffee are governed by postharvest processing. The current review elaborates and establishes for the first time the relationship between different methods of postharvest processing of coffee and its varying organoleptic and sensory quality attributes. In view of the proven significance of each processing step, this review has been subdivided into three sections, secondary processing, primary processing, and postprocessing variables. Secondary processing addresses the immediate processing steps on the farm after harvest and storage before roasting. The primary processing section adheres specifically to roasting, grinding and brewing/extraction, topics which have been technically addressed more than any others in the literature and by industry. The postprocessing attribute section deals generally with interaction of the consumer with products of different visual appearance. Finally, there are still some bottlenecks which need to be addressed, not only to completely understand the relationship of varying postharvest processing methods with varying in-cup quality attributes, but also to devise the next generation of coffee processing technologies.},
  file         = {/Users/ryedida/Zotero/storage/7Q83WZ3D/Hameed et al_2018_Farm to Consumer.pdf;/Users/ryedida/Zotero/storage/9YBBMEHC/1541-4337.html}
}

@inproceedings{hamiltonInductiveRepresentationLearning,
  title = {Inductive {{Representation Learning}} on {{Large Graphs}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Hamilton, William L. and Ying, Rex and Leskovec, Jure},
  date = {2017},
  volume = {30},
  eprint = {1706.02216},
  eprinttype = {arxiv},
  abstract = {Low-dimensional embeddings of nodes in large graphs have proved extremely useful in a variety of prediction tasks, from content recommendation to identifying protein functions. However, most existing approaches require that all nodes in the graph are present during training of the embeddings; these previous approaches are inherently transductive and do not naturally generalize to unseen nodes. Here we present GraphSAGE, a general inductive framework that leverages node feature information (e.g., text attributes) to efficiently generate node embeddings for previously unseen data. Instead of training individual embeddings for each node, we learn a function that generates embeddings by sampling and aggregating features from a node's local neighborhood. Our algorithm outperforms strong baselines on three inductive node-classification benchmarks: we classify the category of unseen nodes in evolving information graphs based on citation and Reddit post data, and we show that our algorithm generalizes to completely unseen graphs using a multi-graph dataset of protein-protein interactions.}
}

@article{Hammond2018,
  title = {Perceived Attitudes about Substance Use in Anonymous Social Media Posts near College Campuses: {{Observational}} Study},
  author = {Hammond, Alexis S. and Paul, Michael J. and Hobelmann, Joseph and Koratana, Animesh R. and Dredze, Mark and Chisolm, Margaret S.},
  date = {2018},
  journaltitle = {JMIR Mental Health},
  volume = {5},
  number = {3},
  pages = {1--7},
  issn = {2368-7959},
  doi = {10.2196/mental.9903},
  keywords = {Alcohol,College,Drugs,Social media,Substance,Yik Yak},
  file = {/Users/ryedida/Zotero/storage/DQUYJ3AG/Hammond et al. - 2018 - Perceived attitudes about substance use in anonymous social media posts near college campuses Observational s(2).pdf}
}

@online{hanDeepCompressionCompressing2016,
  title = {Deep {{Compression}}: {{Compressing Deep Neural Networks}} with {{Pruning}}, {{Trained Quantization}} and {{Huffman Coding}}},
  shorttitle = {Deep {{Compression}}},
  author = {Han, Song and Mao, Huizi and Dally, William J.},
  date = {2016-02-15},
  eprint = {1510.00149},
  eprinttype = {arxiv},
  eprintclass = {cs.CV},
  url = {http://arxiv.org/abs/1510.00149},
  urldate = {2022-08-07},
  abstract = {Neural networks are both computationally intensive and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources. To address this limitation, we introduce “deep compression”, a three stage pipeline: pruning, trained quantization and Huffman coding, that work together to reduce the storage requirement of neural networks by 35× to 49× without affecting their accuracy. Our method first prunes the network by learning only the important connections. Next, we quantize the weights to enforce weight sharing, finally, we apply Huffman coding. After the first two steps we retrain the network to fine tune the remaining connections and the quantized centroids. Pruning, reduces the number of connections by 9× to 13×; Quantization then reduces the number of bits that represent each connection from 32 to 5. On the ImageNet dataset, our method reduced the storage required by AlexNet by 35×, from 240MB to 6.9MB, without loss of accuracy. Our method reduced the size of VGG-16 by 49× from 552MB to 11.3MB, again with no loss of accuracy. This allows fitting the model into on-chip SRAM cache rather than off-chip DRAM memory. Our compression method also facilitates the use of complex neural networks in mobile applications where application size and download bandwidth are constrained. Benchmarked on CPU, GPU and mobile GPU, compressed network has 3× to 4× layerwise speedup and 3× to 7× better energy efficiency.},
  langid = {english},
  pubstate = {preprint},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Neural and Evolutionary Computing},
  annotation = {5757 citations (Semantic Scholar/arXiv) [2022-08-06]},
  file = {/Users/ryedida/Zotero/storage/PU9IR8J5/Han et al. - 2016 - Deep Compression Compressing Deep Neural Networks.pdf}
}

@article{hannekeTheoryDisagreementBasedActive2014,
  title = {Theory of {{Disagreement-Based Active Learning}}},
  author = {Hanneke, Steve},
  date = {2014-06-11},
  journaltitle = {Foundations and Trends® in Machine Learning},
  shortjournal = {MAL},
  volume = {7},
  number = {2--3},
  pages = {131--309},
  publisher = {{Now Publishers, Inc.}},
  issn = {1935-8237, 1935-8245},
  doi = {10.1561/2200000037},
  url = {https://www.nowpublishers.com/article/Details/MAL-037},
  urldate = {2023-11-20},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/JYGU9HML/Hanneke_2014_Theory of Disagreement-Based Active Learning.pdf}
}

@online{hardtTrainFasterGeneralize2016,
  title = {Train Faster, Generalize Better: {{Stability}} of Stochastic Gradient Descent},
  author = {Hardt, Moritz and Recht, Benjamin and Singer, Yoram},
  date = {2016},
  eprint = {1509.01240v2},
  eprinttype = {arxiv},
  url = {https://arxiv.org/abs/1509.01240v2},
  abstract = {We show that parametric models trained by a stochastic gradient method (SGM) with few iterations have vanishing generalization error. We prove our results by arguing that SGM is algorithmically stable in the sense of Bousquet and Elisseeff. Our analysis only employs elementary tools from convex and continuous optimization. We derive stability bounds for both convex and non-convex optimization under standard Lipschitz and smoothness assumptions. Applying our results to the convex case, we provide new insights for why multiple epochs of stochastic gradient methods generalize well in practice. In the non-convex case, we give a new interpretation of common practices in neural networks, and formally show that popular techniques for training large deep models are indeed stability-promoting. Our findings conceptually underscore the importance of reducing training time beyond its obvious benefit.},
  pubstate = {preprint},
  file = {/Users/ryedida/Zotero/storage/HXDYHVK4/Hardt et al_2016_Train faster, generalize better.pdf}
}

@inproceedings{harveyNearlytightVCdimensionBounds2017,
  title = {Nearly-Tight {{VC-dimension}} Bounds for Piecewise Linear Neural Networks},
  booktitle = {Conference on Learning Theory},
  author = {Harvey, Nick and Liaw, Christopher and Mehrabian, Abbas},
  date = {2017},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {65},
  pages = {1064--1068},
  publisher = {{PMLR}},
  issn = {2640-3498},
  file = {/Users/ryedida/Zotero/storage/88UYX3GY/Harvey et al_2017_Nearly-tight VC-dimension bounds for piecewise linear neural networks.pdf}
}

@unpublished{Harwath2018,
  author     = {Harwath, David and Recasens, Adrià and Surís, Dídac and Chuang, Galen and Torralba, Antonio and Glass, James},
  title      = {Jointly {{Discovering Visual Objects}} and {{Spoken Words}} from {{Raw Sensory Input}}},
  date       = {2018},
  eprinttype = {arxiv},
  eprint     = {1804.01452},
  abstract   = {In this paper, we explore neural network models that learn to associate segments of spoken audio captions with the semantically relevant portions of natural images that they refer to. We demonstrate that these audio-visual associative localizations emerge from network-internal representations learned as a by-product of training to perform an image-audio retrieval task. Our models operate directly on the image pixels and speech waveform, and do not rely on any conventional supervision in the form of labels, segmentations, or alignments between the modalities during training. We perform analysis using the Places 205 and ADE20k datasets demonstrating that our models implicitly learn semantically-coupled object and word detectors.},
  file       = {/Users/ryedida/Zotero/storage/GZELI87H/Harwath et al. - 2018 - Jointly Discovering Visual Objects and Spoken Words from Raw Sensory Input(2).pdf}
}

@online{hazanIntroductionOnlineConvex2023,
  author      = {Hazan, Elad},
  title       = {Introduction to {{Online Convex Optimization}}},
  date        = {2023-08-06},
  pubstate    = {preprint},
  eprinttype  = {arxiv},
  eprint      = {1909.05207},
  eprintclass = {cs, math, stat},
  url         = {http://arxiv.org/abs/1909.05207},
  urldate     = {2023-12-06},
  langid      = {english},
  keywords    = {Computer Science - Machine Learning,Mathematics - Optimization and Control,Statistics - Machine Learning},
  abstract    = {This manuscript portrays optimization as a process. In many practical applications the environment is so complex that it is infeasible to lay out a comprehensive theoretical model and use classical algorithmic theory and mathematical optimization. It is necessary as well as beneficial to take a robust approach, by applying an optimization method that learns as one goes along, learning from experience as more aspects of the problem are observed. This view of optimization as a process has become prominent in varied fields and has led to some spectacular success in modeling and systems that are now part of our daily lives.},
  file        = {/Users/ryedida/Zotero/storage/5WG65PG2/Hazan - 2023 - Introduction to Online Convex Optimization.pdf}
}

@online{hazardNativelyInterpretableMachine2019,
  author      = {Hazard, Christopher J. and Fusting, Christopher and Resnick, Michael and Auerbach, Michael and Meehan, Michael and Korobov, Valeri},
  title       = {Natively {{Interpretable Machine Learning}} and {{Artificial Intelligence}}: {{Preliminary Results}} and {{Future Directions}}},
  shorttitle  = {Natively {{Interpretable Machine Learning}} and {{Artificial Intelligence}}},
  date        = {2019-01-18},
  pubstate    = {preprint},
  eprinttype  = {arxiv},
  eprint      = {1901.00246},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1901.00246},
  urldate     = {2023-11-14},
  langid      = {english},
  keywords    = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract    = {Machine learning models have become more and more complex in order to better approximate complex functions. Although fruitful in many domains, the added complexity has come at the cost of model interpretability. The once popular k-nearest neighbors (kNN) approach, which finds and uses the most similar data for reasoning, has received much less attention in recent decades due to numerous problems when compared to other techniques. We show that many of these historical problems with kNN can be overcome, and our contribution has applications not only in machine learning but also in online learning, data synthesis, anomaly detection, model compression, and reinforcement learning, without sacrificing interpretability. We introduce a synthesis between kNN and information theory that we hope will provide a clear path towards models that are innately interpretable and auditable. Through this work we hope to gather interest in combining kNN with information theory as a promising path to fully auditable machine learning and artificial intelligence.},
  file        = {/Users/ryedida/Zotero/storage/9WR8FGCE/Hazard et al. - 2019 - Natively Interpretable Machine Learning and Artifi.pdf}
}

@article{He,
  title = {Spatial {{Pyramid Pooling}} in {{Deep Convolutional Networks}} for {{Visual Recognition}}},
  author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  date = {2015},
  journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume = {37},
  number = {9},
  pages = {1904--1916},
  doi = {10.1109/TPAMI.2015.2389824},
  eprint = {1406.4729v4},
  eprinttype = {arxiv},
  file = {/Users/ryedida/Zotero/storage/BIAX5ISX/He et al. - Unknown - Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition(2).pdf}
}

@inproceedings{he2016deep,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the {{IEEE}} Conference on Computer Vision and Pattern Recognition},
  date      = {2016},
  pages     = {770--778}
}

@inproceedings{Hea,
  title = {Delving {{Deep}} into {{Rectifiers}}: {{Surpassing Human-Level Performance}} on {{ImageNet Classification}}},
  booktitle = {Proceedings of the {{IEEE International Conference}} on {{Computer Vision}} ({{ICCV}})},
  author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  date = {2015},
  pages = {1026--1034},
  issn = {1090-2104},
  doi = {10.1109/ICCV.2015.123},
  isbn = {978-1-4673-8391-2},
  file = {/Users/ryedida/Zotero/storage/QAAXDRVZ/He et al. - Unknown - Delving Deep into Rectifiers Surpassing Human-Level Performance on ImageNet Classification(2).pdf}
}

@inproceedings{heAdversarialExampleDefenses,
  title = {Adversarial {{Example Defenses}}: {{Ensembles}} of {{Weak Defenses}} Are Not {{Strong}}},
  booktitle = {11th {{USENIX Workshop}} on {{Offensive Technologies}} ({{WOOT}} 17)},
  author = {He, Warren and Wei, James and Chen, Xinyun and Carlini, Nicholas and Song, Dawn},
  date = {2017},
  publisher = {{USENIX Association}},
  eprint = {1706.04701},
  eprinttype = {arxiv},
  abstract = {Ongoing research has proposed several methods to defend neural networks against adversarial examples, many of which researchers have shown to be ineffective. We ask whether a strong defense can be created by combining multiple (possibly weak) defenses. To answer this question, we study three defenses that follow this approach. Two of these are recently proposed defenses that intentionally combine components designed to work well together. A third defense combines three independent defenses. For all the components of these defenses and the combined defenses themselves, we show that an adaptive adversary can create adversarial examples successfully with low distortion. Thus, our work implies that ensemble of weak defenses is not sufficient to provide strong defense against adversarial examples.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/VG7ZB9JL/He et al. - Adversarial Example Defenses Ensembles of Weak De.pdf}
}

@unpublished{heinFormalGuaranteesRobustness2017,
  author      = {Hein, Matthias and Andriushchenko, Maksym},
  title       = {Formal {{Guarantees}} on the {{Robustness}} of a {{Classifier}} against {{Adversarial Manipulation}}},
  date        = {2017-11-05},
  eprinttype  = {arxiv},
  eprint      = {1705.08475},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1705.08475},
  urldate     = {2021-03-27},
  langid      = {english},
  keywords    = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract    = {Recent work has shown that state-of-the-art classifiers are quite brittle, in the sense that a small adversarial change of an originally with high confidence correctly classified input leads to a wrong classification again with high confidence. This raises concerns that such classifiers are vulnerable to attacks and calls into question their usage in safety-critical systems. We show in this paper for the first time formal guarantees on the robustness of a classifier by giving instance-specific lower bounds on the norm of the input manipulation required to change the classifier decision. Based on this analysis we propose the Cross-Lipschitz regularization functional. We show that using this form of regularization in kernel methods resp. neural networks improves the robustness of the classifier with no or small loss in prediction performance.},
  file        = {/Users/ryedida/Zotero/storage/IUD9XU3U/Hein and Andriushchenko - 2017 - Formal Guarantees on the Robustness of a Classifie.pdf}
}

@inproceedings{heinWhyReLUNetworks2019,
  title = {Why {{ReLU Networks Yield High-Confidence Predictions Far Away From}} the {{Training Data}} and {{How}} to {{Mitigate}} the {{Problem}}},
  booktitle = {2019 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  author = {Hein, Matthias and Andriushchenko, Maksym and Bitterwolf, Julian},
  date = {2019-06},
  pages = {41--50},
  publisher = {{IEEE}},
  venue = {Long Beach, CA, USA},
  doi = {10.1109/CVPR.2019.00013},
  url = {https://ieeexplore.ieee.org/document/8953721/},
  urldate = {2021-04-22},
  abstract = {Classifiers used in the wild, in particular for safety-critical systems, should know when they don’t know, in particular make low confidence predictions far away from the training data. We show that ReLU type neural networks fail in this regard as they produce almost always high confidence predictions far away from the training data. For bounded domains we propose a new robust optimization technique similar to adversarial training which enforces low confidence predictions far away from the training data. We show that this technique is surprisingly effective in reducing the confidence of predictions far away from the training data while maintaining high confidence predictions and test error on the original classification task compared to standard training. This is a short version of the corresponding CVPR paper.},
  eventtitle = {2019 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  isbn = {978-1-72813-293-8},
  langid = {english},
  annotation = {108 citations (Semantic Scholar/DOI) [2021-04-22]},
  file = {/Users/ryedida/Zotero/storage/FCMYKY56/Hein et al. - 2019 - Why ReLU Networks Yield High-Confidence Prediction.pdf}
}

@unpublished{Hendrycks2019,
  author     = {Hendrycks, Dan and Zhao, Kevin and Basart, Steven and Steinhardt, Jacob and Song, Dawn},
  title      = {Natural {{Adversarial Examples}}},
  date       = {2019},
  eprinttype = {arxiv},
  eprint     = {1907.07174},
  url        = {http://arxiv.org/abs/1907.07174},
  abstract   = {We introduce natural adversarial examples -- real-world, unmodified, and naturally occurring examples that cause classifier accuracy to significantly degrade. We curate 7,500 natural adversarial examples and release them in an ImageNet classifier test set that we call ImageNet-A. This dataset serves as a new way to measure classifier robustness. Like l\_p adversarial examples, ImageNet-A examples successfully transfer to unseen or black-box classifiers. For example, on ImageNet-A a DenseNet-121 obtains around 2\% accuracy, an accuracy drop of approximately 90\%. Recovering this accuracy is not simple because ImageNet-A examples exploit deep flaws in current classifiers including their over-reliance on color, texture, and background cues. We observe that popular training techniques for improving robustness have little effect, but we show that some architectural changes can enhance robustness to natural adversarial examples. Future research is required to enable robust generalization to this hard ImageNet test set.},
  file       = {/Users/ryedida/Zotero/storage/W2874SRK/Hendrycks et al. - 2019 - Natural Adversarial Examples(2).pdf}
}

@inproceedings{hernandez2014predictive,
  title = {Predictive Entropy Search for Efficient Global Optimization of Black-Box Functions},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Hernández-Lobato, José Miguel and Hoffman, Matthew W. and Ghahramani, Zoubin},
  date = {2014},
  volume = {27}
}

@inproceedings{Hertz2013,
  title = {Investigating Factors of Student Learning in Introductory Courses},
  booktitle = {Proceedings of the 44th {{ACM Technical Symposium}} on {{Computer Science Education}}},
  author = {Hertz, Matthew and Ford, Sarah Michele},
  date = {2013},
  pages = {195--200},
  doi = {10.1145/2445196.2445254},
  url = {http://dl.acm.org/citation.cfm?doid=2445196.2445254},
  abstract = {Instructors of the introductory computer science courses, commonly called "CS1" and "CS2", face a large number of choices when designing their classes. Instructors have available to them a multitude of ways to explain each topic as well as course-wide choices such as objects-first or objects-late or using a functional or procedural language. Understanding how these options can affect student learning would help simplify these decisions. Unfortunately, just comparing how well students perform may not be accurate as it ignores the many confounding factors that could also have made a difference. To get beyond that problem, this study investigates underlying factors that affect student learning. Using a survey of instructors, we find that students' abilities are nearly always correlated with the importance that the instructor placed on a particular topic. Our results also highlight several "hard" topics for which student mastery and topic importance were not correlated in CS1 and only weakly correlated in CS2. While one might expect the time spent covering a topic in class to also be correlated with student mastery, we find little evidence of this. In fact, for some basic programming concepts, we document negative correlations between instructional time and learning. We discuss how instructors can use these results when organizing their courses and how the computer science education community can use this finding of "hard" topics to focus their efforts.},
  isbn = {978-1-4503-1868-6},
  keywords = {cs1,cs2,curriculum design,survey},
  file = {/Users/ryedida/Zotero/storage/CXWG2DE3/Hertz, Ford - 2013 - Investigating factors of student learning in introductory courses(2).pdf}
}

@inproceedings{Hertz2013a,
  title = {Trace-Based Teaching in Early Programming Courses},
  booktitle = {Proceedings of the 44th {{ACM Technical Symposium}} on {{Computer Science Education}}},
  author = {Hertz, Matthew and Jump, Maria},
  date = {2013},
  pages = {561--566},
  doi = {10.1145/2445196.2445364},
  url = {http://dl.acm.org/citation.cfm?doid=2445196.2445364},
  abstract = {Students in introductory programming courses struggle with building the mental models that correctly describe concepts such as variables, subroutine calls, and dynamic memory usage. This struggle leads to lowered student learning outcomes and, it has been argued, the high failure and dropout rates commonly seen in these courses. We will show that accurately modeling what is occurring in memory and requiring students to trace code using this model improves student performance and increases retention. This paper presents the results of an experiment in which introductory programming courses were organized around code tracing. We present program memory traces, a new approach for tracing code that models what occurs in memory as a program executes. We use these traces to drive our lectures and to act as key pieces of our active learning activities. We report the results of student surveys showing that instructor tracing was rated as the most valuable piece of the course and students' overwhelming agreement on the importance of the tracing activities for their learning. Finally, we demonstrate that trace-based teaching led to statistically significant improvements student grades, decreased drop and failure rates, and an improvement in students' programming abilities.},
  isbn = {978-1-4503-1868-6},
  keywords = {cs1,cs2,pedagogy,tracing},
  file = {/Users/ryedida/Zotero/storage/A779J2MZ/Hertz, Jump - 2013 - Trace-based teaching in early programming courses(2).pdf}
}

@online{hinLineVDStatementlevelVulnerability2022,
  author      = {Hin, David and Kan, Andrey and Chen, Huaming and Babar, M. Ali},
  title       = {{{LineVD}}: {{Statement-level Vulnerability Detection}} Using {{Graph Neural Networks}}},
  shorttitle  = {{{LineVD}}},
  date        = {2022-03-25},
  pubstate    = {preprint},
  eprinttype  = {arxiv},
  eprint      = {2203.05181},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2203.05181},
  urldate     = {2023-10-06},
  langid      = {english},
  keywords    = {Computer Science - Cryptography and Security,Computer Science - Software Engineering},
  abstract    = {Current machine-learning based software vulnerability detection methods are primarily conducted at the function-level. However, a key limitation of these methods is that they do not indicate the specific lines of code contributing to vulnerabilities. This limits the ability of developers to efficiently inspect and interpret the predictions from a learnt model, which is crucial for integrating machine-learning based tools into the software development workflow. Graph-based models have shown promising performance in function-level vulnerability detection, but their capability for statement-level vulnerability detection has not been extensively explored. While interpreting function-level predictions through explainable AI is one promising direction, we herein consider the statement-level software vulnerability detection task from a fully supervised learning perspective. We propose a novel deep learning framework, LineVD, which formulates statement-level vulnerability detection as a node classification task. LineVD leverages control and data dependencies between statements using graph neural networks, and a transformer-based model to encode the raw source code tokens. In particular, by addressing the conflicting outputs between function-level and statement-level information, LineVD significantly improve the prediction performance without vulnerability status for function code. We have conducted extensive experiments against a large-scale collection of real-world C/C++ vulnerabilities obtained from multiple real-world projects, and demonstrate an increase of 105\% in F1-score over the current state-of-the-art.},
  file        = {/Users/ryedida/Zotero/storage/UHXPPKWE/Hin et al. - 2022 - LineVD Statement-level Vulnerability Detection us.pdf}
}

@article{hinton2009deep,
  author       = {Hinton, Geoffrey E},
  title        = {Deep Belief Networks},
  journaltitle = {Scholarpedia},
  date         = {2009},
  volume       = {4},
  number       = {5},
  pages        = {5947}
}

@unpublished{Hinton2015,
  author     = {Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff},
  title      = {Distilling the {{Knowledge}} in a {{Neural Network}}},
  date       = {2015},
  pages      = {1--9},
  eprinttype = {arxiv},
  eprint     = {1503.02531},
  url        = {http://arxiv.org/abs/1503.02531},
  abstract   = {A very simple way to improve the performance of almost any machine learning algorithm is to train many different models on the same data and then to average their predictions. Unfortunately, making predictions using a whole ensemble of models is cumbersome and may be too computationally expensive to allow deployment to a large number of users, especially if the individual models are large neural nets. Caruana and his collaborators have shown that it is possible to compress the knowledge in an ensemble into a single model which is much easier to deploy and we develop this approach further using a different compression technique. We achieve some surprising results on MNIST and we show that we can significantly improve the acoustic model of a heavily used commercial system by distilling the knowledge in an ensemble of models into a single model. We also introduce a new type of ensemble composed of one or more full models and many specialist models which learn to distinguish fine-grained classes that the full models confuse. Unlike a mixture of experts, these specialist models can be trained rapidly and in parallel.},
  file       = {/Users/ryedida/Zotero/storage/QVNSDYRY/Hinton, Vinyals, Dean - 2015 - Distilling the Knowledge in a Neural Network(2).pdf}
}

@inproceedings{hintonKeepingNeuralNetworks,
  title = {Keeping {{Neural Networks Simple}} by {{Minimizing}} the {{Description Length}} of the {{Weights}}},
  booktitle = {Proceedings of the {{Sixth Annual Conference}} on {{Computational Learning Theory}}},
  author = {Hinton, Geoffrey E. and family=Camp, given=Drew, prefix=van, useprefix=true},
  date = {1993},
  pages = {5--13},
  publisher = {{ACM}},
  doi = {10.1145/168304.168306},
  abstract = {Supervised neural networks generalize well if there is much less information in the weights than there is in the output vectors of the training cases. So during learning, it is important to keep the weights simple by penalizing the amount of information they contain. The amount of information in a weight can be controlled by adding Gaussian noise and the noise level can be adapted during learning to optimize the trade-off between the expected squared error of the network and the amount of information in the weights. We describe a method of computing the derivatives of the expected squared error and of the amount of information in the noisy weights in a network that contains a layer of non-linear hidden units. Provided the output units are linear, the exact derivatives can be computed efficiently without time-consuming Monte Carlo simulations. The idea of minimizing the amount of information that is required to communicate the weights of a neural network leads to a number of interesting schemes for encoding the weights.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/WE7JRZEK/Hinton - Keeping Neural Networks Simple by Minimizing the D.pdf}
}

@misc{hiranandaniOptimizingBlackboxMetrics2021,
  author   = {Hiranandani, Gaurush and Mathur, Jatin and Koyejo, Oluwasanmi and Fard, Mahdi Milani and Narasimhan, Harikrishna},
  title    = {Optimizing {{Black-box Metrics}} with {{Iterative Example Weighting}}},
  date     = {2021},
  keywords = {hiran2021optimizing}
}

@inproceedings{hoang2019deepjit,
  author    = {Hoang, Thong and Dam, Hoa Khanh and Kamei, Yasutaka and Lo, David and Ubayashi, Naoyasu},
  title     = {{{DeepJIT}}: An End-to-End Deep Learning Framework for Just-in-Time Defect Prediction},
  booktitle = {2019 {{IEEE}}/{{ACM}} 16th {{International Conference}} on {{Mining Software Repositories}} ({{MSR}})},
  date      = {2019},
  pages     = {34--45},
  publisher = {{IEEE}}
}

@article{hochreiterFlatMinima1997,
  title = {Flat Minima},
  author = {Hochreiter, Sepp and Schmidhuber, Jürgen},
  date = {1997},
  journaltitle = {Neural Computation},
  volume = {9},
  number = {1},
  pages = {1--42},
  publisher = {{MIT Press}},
  issn = {0899-7667},
  doi = {10.1162/neco.1997.9.1.1},
  file = {/Users/ryedida/Zotero/storage/5YLQAPJ9/Hochreiter_Schmidhuber_1997_Flat minima.pdf}
}

@inproceedings{hochreiterSIMPLIFYINGNEURALNETS,
  title = {Simplifying {{Neural Nets}} by {{Discovering Flat Minima}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Hochreiter, Sepp and Schmidhuber, Jürgen},
  editor = {Tesauro, Gerald and Touretzky, David S. and Leen, Todd K.},
  date = {1995},
  volume = {7},
  pages = {529--536},
  publisher = {{MIT Press}},
  abstract = {We present a new algorithm for finding low complexity networks with high generalization capability. The algorithm searches for large connected regions of so-called "flat" minima of the error function. In the weight-space environment of a "flat" minimum, the error remains approximately constant. Using an MDL-based argument, flat minima can be shown to correspond to low expected overfitting. Although our algorithm requires the computation of second order derivatives, it has backprop's order of complexity. Experiments with feedforward and recurrent nets are described. In an application to stock market prediction, the method outperforms conventional backprop, weight decay, and "optimal brain surgeon".},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/LLEITKB7/Hochreiter and Schmidhuber - SIMPLIFYING NEURAL NETS BY DISCOVERING FLAT MINIMA.pdf}
}

@inproceedings{Hoegen2018,
  title = {The Impact of Agent Facial Mimicry on Social Behavior in a Prisoner's Dilemma},
  booktitle = {Proceedings of the 18th {{International Conference}} on {{Intelligent Virtual Agents}}},
  author = {Hoegen, Rens and family=Schalk, given=Job, prefix=van der, useprefix=true and Lucas, Gale and Gratch, Jonathan},
  date = {2018},
  pages = {275--280},
  doi = {10.1145/3267851.3267911},
  url = {https://dl.acm.org/citation.cfm?id=3267911},
  isbn = {978-1-4503-6013-5},
  keywords = {Emotion recognition,Facial mimicry,Virtual Humans},
  file = {/Users/ryedida/Zotero/storage/KIP5P63B/Hoegen et al. - 2018 - The impact of agent facial mimicry on social behavior in a prisoner's dilemma(2).pdf}
}

@article{Hoover2018,
  title = {Moral Framing and Charitable Donation: {{Integrating}} Exploratory Social Media Analyses and Confirmatory Experimentation},
  author = {Hoover, Joe and Johnson, Kate M. and Boghrati, Reihane and Graham, Jesse and Dehghani, Morteza},
  date = {2018},
  journaltitle = {Collabra: Psychology},
  volume = {4},
  number = {1},
  pages = {9},
  issn = {2474-7394},
  doi = {10.1525/collabra.129},
  url = {https://www.collabra.org/article/10.1525/collabra.129/},
  abstract = {Do appeals to moral values promote charitable donation during natural disasters? Using Distributed Dictionary Representation, we analyze tweets posted during Hurricane Sandy to explore associations between moral values and charitable donation sentiment. We then derive hypotheses from the observed associations and test these hypotheses across a series of preregistered experiments that investigate the effects of moral framing on perceived donation motivation (Studies 2 \& 3), hypothetical donation (Study 4), and real donation behavior (Study 5). Overall, we find consistent positive associations between moral care and loyalty framing with donation sentiment and donation motivation. However, in contrast with people’s perceptions, we also find that moral frames may not actually have reliable effects on charitable donation, as measured by hypothetical indications of donation and real donation behavior. Overall, this work demonstrates that theoretically constrained, exploratory social media analyses can be used to generate viable hypotheses, but also that such approaches should be paired with rigorous controlled experiments.},
  keywords = {charitable donation,moral psychology,natural language processing,social media},
  file = {/Users/ryedida/Zotero/storage/ZKDABX24/Hoover et al. - 2018 - Moral framing and charitable donation Integrating exploratory social media analyses and confirmatory experimen(2).pdf}
}

@article{hornik1989multilayer,
  title = {Multilayer Feedforward Networks Are Universal Approximators},
  author = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},
  date = {1989},
  journaltitle = {Neural Networks},
  volume = {2},
  number = {5},
  pages = {359--366},
  publisher = {{Elsevier}},
  issn = {0893-6080},
  doi = {10.1016/0893-6080(89)90020-8}
}

@article{hornikApproximationCapabilitiesMultilayer1991,
  title = {Approximation Capabilities of Multilayer Feedforward Networks},
  author = {Hornik, Kurt},
  date = {1991},
  journaltitle = {Neural Networks},
  volume = {4},
  number = {2},
  pages = {251--257},
  issn = {0893-6080},
  doi = {10.1016/0893-6080(91)90009-T},
  abstract = {We show that standard multilayer feedforward networks with as few as a single hidden layer and arbitrary bounded and nonconstant activation function are universal approximators with respect to Lp(μ) performance criteria, for arbitrary finite input environment measures μ, provided only that sufficiently many hidden units are available. If the activation function is continuous, bounded and nonconstant, then continuous mappings can be learned uniformly over compact input sets. We also give very general conditions ensuring that networks with sufficiently smooth activation functions are capable of arbitrarily accurate approximation to a function and its derivatives. © 1991.},
  keywords = {Activation function,Input environment measure,Lp(μ) approximation,Multilayer feedforward networks,Smooth approximation,Sobolev spaces,Uniform approximation,Universal approximation capabilities},
  file = {/Users/ryedida/Zotero/storage/REEI7EHV/Hornik - 1991 - Approximation capabilities of multilayer feedforward networks(2).pdf}
}

@article{hovemeyer2004finding,
  title = {Finding Bugs Is Easy},
  author = {Hovemeyer, David and Pugh, William},
  date = {2004},
  journaltitle = {ACM SIGPLAN Notices},
  volume = {39},
  number = {12},
  pages = {92--106},
  publisher = {{ACM}},
  location = {{New York, NY, USA}}
}

@unpublished{howard2018universal,
  author = {Howard, Jeremy and Ruder, Sebastian},
  title = {Universal Language Model Fine-Tuning for Text Classification},
  eprinttype = {arxiv},
  eprint = {1801.06146},
  date = {2018}
}

@unpublished{Hu2019,
  title = {Topology-{{Preserving Deep Image Segmentation}}},
  author = {Hu, Xiaoling and Li, Fuxin and Samaras, Dimitris and Chen, Chao},
  date = {2019},
  eprint = {1906.05404},
  eprinttype = {arxiv},
  pages = {1--11},
  url = {http://arxiv.org/abs/1906.05404},
  abstract = {Segmentation algorithms are prone to make topological errors on fine-scale structures, e.g., broken connections. We propose a novel method that learns to segment with correct topology. In particular, we design a continuous-valued loss function that enforces a segmentation to have the same topology as the ground truth, i.e., having the same Betti number. The proposed topology-preserving loss function is differentiable and we incorporate it into end-to-end training of a deep neural network. Our method achieves much better performance on the Betti number error, which directly accounts for the topological correctness. It also performs superiorly on other topology-relevant metrics, e.g., the Adjusted Rand Index and the Variation of Information. We illustrate the effectiveness of the proposed method on a broad spectrum of natural and biomedical datasets.}
}

@inproceedings{Huang2017,
  title = {Learner {{Modeling}} for {{Integration Skills}}},
  booktitle = {Proceedings of the 25th {{Conference}} on {{User Modeling}}, {{Adaptation}} and {{Personalization}} - {{UMAP}} '17},
  author = {Huang, Yun and Guerra-Hollstein, Julio and Barria-Pineda, Jordan and Brusilovsky, Peter},
  date = {2017},
  doi = {10.1145/3079628.3079677},
  abstract = {Complex skill mastery requires not only acquiring individual basic component skills, but also practicing integrating such basic skills. However, traditional approaches to knowledge modeling, such as Bayesian knowledge tracing, only trace knowledge of each decomposed basic component skill. This risks early assertion of mastery or ineffective remediation failing to address skill integration. We introduce a novel integration-level approach to model learners' knowledge and provide fine-grained diagnosis: a Bayesian network based on a new kind of knowledge graph with progressive integration skills. We assess the value of such a model from multifaceted aspects: performance prediction, parameter plausibility, expected instructional effectiveness, and real-world recommendation helpfulness. Our experiments based on a Java programming tutor show that proposed model significantly improves two popular multiple-skill knowledge tracing models on all these four aspects.},
  isbn = {9781450346351}
}

@inproceedings{Huang2017a,
  title = {Safety Verification of Deep Neural Networks},
  booktitle = {Computer {{Aided Verification}}},
  author = {Huang, Xiaowei and Kwiatkowska, Marta and Wang, Sen and Wu, Min},
  date = {2017},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {10426},
  eprint = {1610.06940},
  eprinttype = {arxiv},
  pages = {3--29},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-63387-9_1},
  abstract = {Deep neural networks have achieved impressive experimental results in image classification, but can surprisingly be unstable with respect to adversarial perturbations, that is, minimal changes to the input image that cause the network to misclassify it. With potential applications including perception modules and end-to-end controllers for self-driving cars, this raises concerns about their safety. We develop a novel automated verification framework for feed-forward multi-layer neural networks based on Satisfiability Modulo Theory (SMT). We focus on safety of image classification decisions with respect to image manipulations, such as scratches or changes to camera angle or lighting conditions that would result in the same class being assigned by a human, and define safety for an individual decision in terms of invariance of the classification within a small neighbourhood of the original image. We enable exhaustive search of the region by employing discretisation, and propagate the analysis layer by layer. Our method works directly with the network code and, in contrast to existing methods, can guarantee that adversarial examples, if they exist, are found for the given region and family of manipulations. If found, adversarial examples can be shown to human testers and/or used to fine-tune the network. We implement the techniques using Z3 and evaluate them on state-of-the-art networks, including regularised and deep learning networks. We also compare against existing techniques to search for adversarial examples},
  isbn = {9783319633862},
  file = {/Users/ryedida/Zotero/storage/BSPT5FAI/Huang et al. - 2017 - Safety verification of deep neural networks(2).pdf}
}

@inproceedings{huang2017densely,
  title = {Densely Connected Convolutional Networks},
  booktitle = {Proceedings of the {{IEEE}} Conference on Computer Vision and Pattern Recognition},
  author = {Huang, Gao and Liu, Zhuang and van der Maaten, Laurens and Weinberger, Kilian Q},
  date = {2017},
  pages = {4700--4708},
  doi = {10.1109/CVPR.2017.243},
  file = {/Users/ryedida/Zotero/storage/69T2C9FZ/Huang et al. - 2017 - Densely connected convolutional networks(2).pdf}
}

@unpublished{huangAddressingLossMetricMismatch2019,
  author = {Huang, Chen and Zhai, Shuangfei and Talbott, Walter and Bautista, Miguel Angel and Sun, Shih-Yu and Guestrin, Carlos and Susskind, Josh},
  title = {Addressing the {{Loss-Metric Mismatch}} with {{Adaptive Loss Alignment}}},
  date = {2019-05-14},
  eprinttype = {arxiv},
  eprint = {1905.05895},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1905.05895},
  urldate = {2021-04-11},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract = {In most machine learning training paradigms a fixed, often handcrafted, loss function is assumed to be a good proxy for an underlying evaluation metric. In this work we assess this assumption by meta-learning an adaptive loss function to directly optimize the evaluation metric. We propose a sample efficient reinforcement learning approach for adapting the loss dynamically during training. We empirically show how this formulation improves performance by simultaneously optimizing the evaluation metric and smoothing the loss landscape. We verify our method in metric learning and classification scenarios, showing considerable improvements over the state-of-the-art on a diverse set of tasks. Importantly, our method is applicable to a wide range of loss functions and evaluation metrics. Furthermore, the learned policies are transferable across tasks and data, demonstrating the versatility of the method.},
  annotation = {19 citations (Semantic Scholar/arXiv) [2021-04-11]},
  file = {/Users/ryedida/Zotero/storage/GQZA8FU8/Huang et al. - 2019 - Addressing the Loss-Metric Mismatch with Adaptive .pdf}
}

@inproceedings{huangAPIMethodRecommendation2018,
  author = {Huang, Qiao and Xia, Xin and Xing, Zhenchang and Lo, David and Wang, Xinyu},
  title = {{{API Method Recommendation}} without {{Worrying}} about the {{Task-API Knowledge Gap}}},
  booktitle = {2018 33rd {{IEEE}}/{{ACM International Conference}} on {{Automated Software Engineering}} ({{ASE}})},
  eventtitle = {2018 33rd {{IEEE}}/{{ACM International Conference}} on {{Automated Software Engineering}} ({{ASE}})},
  date = {2018-09},
  pages = {293--304},
  issn = {2643-1572},
  doi = {10.1145/3238147.3238191},
  keywords = {API Documentation,API Recommendation,Stack Overflow,Word Embedding},
  abstract = {Developers often need to search for appropriate APIs for their programming tasks. Although most libraries have API reference documentation, it is not easy to find appropriate APIs due to the lexical gap and knowledge gap between the natural language description of the programming task and the API description in API documentation. Here, the lexical gap refers to the fact that the same semantic meaning can be expressed by different words, and the knowledge gap refers to the fact that API documentation mainly describes API functionality and structure but lacks other types of information like concepts and purposes, which are usually the key information in the task description. In this paper, we propose an API recommendation approach named BIKER (Bi-Information source based KnowledgE Recommendation) to tackle these two gaps. To bridge the lexical gap, BIKER uses word embedding technique to calculate the similarity score between two text descriptions. Inspired by our survey findings that developers incorporate Stack Overflow posts and API documentation for bridging the knowledge gap, BIKER leverages Stack Overflow posts to extract candidate APIs for a program task, and ranks candidate APIs by considering the query's similarity with both Stack Overflow posts and API documentation. It also summarizes supplementary information (e.g., API description, code examples in Stack Overflow posts) for each API to help developers select the APIs that are most relevant to their tasks. Our evaluation with 413 API-related questions confirms the effectiveness of BIKER for both class- and method-level API recommendation, compared with state-of-the-art baselines. Our user study with 28 Java developers further demonstrates the practicality of BIKER for API search.},
  annotation = {43 citations (Semantic Scholar/DOI) [2021-06-12]},
  file = {/Users/ryedida/Zotero/storage/3ZKL9Y9K/Huang et al. - 2018 - API Method Recommendation without Worrying about t.pdf;/Users/ryedida/Zotero/storage/AEUMEP8H/9000025.html}
}

@article{huangRetainingBeneficialInformation,
  author = {Huang, Long-Kai and Zhao, Peilin and Huang, Junzhou and Pan, Sinno Jialin},
  title = {Retaining {{Beneficial Information}} from {{Detrimental Data}} for {{Deep Neural Network Repair}}},
  langid = {english},
  abstract = {The performance of deep learning models heavily relies on the quality of the training data. Inadequacies in the training data, such as corrupt input or noisy labels, can lead to the failure of model generalization. Recent studies propose repairing the model by identifying the training samples that contribute to the failure and removing their influence from the model. However, it is important to note that the identified data may contain both beneficial and detrimental information. Simply erasing the information of the identified data from the model can have a negative impact on its performance, especially when accurate data is mistakenly identified as detrimental and removed. To overcome this challenge, we propose a novel approach that leverages the knowledge obtained from a retained clean set. Concretely, Our method first identifies harmful data by utilizing the clean set, then separates the beneficial and detrimental information within the identified data. Finally, we utilize the extracted beneficial information to enhance the model’s performance. Through empirical evaluations, we demonstrate that our method outperforms baseline approaches in both identifying harmful data and rectifying model failures. Particularly in scenarios where identification is challenging and a significant amount of benign data is involved, our method improves performance while the baselines deteriorate due to the erroneous removal of beneficial information.},
  file = {/Users/ryedida/Zotero/storage/BWIIUGID/Huang et al. - Retaining Beneficial Information from Detrimental .pdf}
}

% NOTE(review): title, author and abstract are identical to entry
% huangRetainingBeneficialInformation; only the attached file path differs.
% Looks like a duplicate import -- confirm and consider merging the two keys.
% NOTE(review): this @article carries no journaltitle or date; venue and year
% need to be filled in from the publication itself.
@article{huangRetainingBeneficialInformationa,
  title = {Retaining {{Beneficial Information}} from {{Detrimental Data}} for {{Deep Neural Network Repair}}},
  author = {Huang, Long-Kai and Zhao, Peilin and Huang, Junzhou and Pan, Sinno Jialin},
  abstract = {The performance of deep learning models heavily relies on the quality of the training data. Inadequacies in the training data, such as corrupt input or noisy labels, can lead to the failure of model generalization. Recent studies propose repairing the model by identifying the training samples that contribute to the failure and removing their influence from the model. However, it is important to note that the identified data may contain both beneficial and detrimental information. Simply erasing the information of the identified data from the model can have a negative impact on its performance, especially when accurate data is mistakenly identified as detrimental and removed. To overcome this challenge, we propose a novel approach that leverages the knowledge obtained from a retained clean set. Concretely, Our method first identifies harmful data by utilizing the clean set, then separates the beneficial and detrimental information within the identified data. Finally, we utilize the extracted beneficial information to enhance the model’s performance. Through empirical evaluations, we demonstrate that our method outperforms baseline approaches in both identifying harmful data and rectifying model failures. Particularly in scenarios where identification is challenging and a significant amount of benign data is involved, our method improves performance while the baselines deteriorate due to the erroneous removal of beneficial information.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/4RFC8QQ6/Huang et al. - Retaining Beneficial Information from Detrimental .pdf}
}

@inproceedings{huTopologyPreservingDeepImage,
  title = {Topology-{{Preserving Deep Image Segmentation}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Hu, Xiaoling and Li, Fuxin and Samaras, Dimitris and Chen, Chao},
  date = {2019},
  volume = {32},
  eprint = {1906.05404},
  eprinttype = {arxiv},
  abstract = {Segmentation algorithms are prone to topological errors on fine-scale structures, e.g., broken connections. We propose a novel method that learns to segment with correct topology. In particular, we design a continuous-valued loss function that enforces a segmentation to have the same topology as the ground truth, i.e., having the same Betti number. The proposed topology-preserving loss function is differentiable and we incorporate it into end-to-end training of a deep neural network. Our method achieves much better performance on the Betti number error, which directly accounts for the topological correctness. It also performs superiorly on other topology-relevant metrics, e.g., the Adjusted Rand Index and the Variation of Information. We illustrate the effectiveness of the proposed method on a broad spectrum of natural and biomedical datasets.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/CGB6YPGR/Hu et al. - Topology-Preserving Deep Image Segmentation.pdf}
}

@book{hutterAutomatedMachineLearning2019,
  editor = {Hutter, Frank and Kotthoff, Lars and Vanschoren, Joaquin},
  title = {Automated {{Machine Learning}}: {{Methods}}, {{Systems}}, {{Challenges}}},
  shorttitle = {Automated {{Machine Learning}}},
  series = {The {{Springer Series}} on {{Challenges}} in {{Machine Learning}}},
  publisher = {{Springer International Publishing}},
  location = {{Cham}},
  date = {2019},
  isbn = {978-3-030-05317-8 978-3-030-05318-5},
  doi = {10.1007/978-3-030-05318-5},
  url = {http://link.springer.com/10.1007/978-3-030-05318-5},
  urldate = {2024-01-22},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/CJVLCT96/Hutter et al. - 2019 - Automated Machine Learning Methods, Systems, Chal.pdf}
}

@inproceedings{Ilievski2017,
  title = {Efficient Hyperparameter Optimization of Deep Learning Algorithms Using Deterministic {{RBF}} Surrogates},
  booktitle = {31st {{AAAI}} Conference on Artificial Intelligence, {{AAAI}} 2017},
  author = {Ilievski, Ilija and Akhtar, Taimoor and Feng, Jiashi and Shoemaker, Christine Annette},
  date = {2017},
  pages = {822--829},
  abstract = {Automatically searching for optimal hyperparameter configurations is of crucial importance for applying deep learning algorithms in practice. Recently, Bayesian optimization has been proposed for optimizing hyperparameters of various machine learning algorithms. Those methods adopt probabilistic surrogate models like Gaussian processes to approximate and minimize the validation error function of hyperparameter values. However, probabilistic surrogates require accurate estimates of sufficient statistics (e.g., covariance) of the error distribution and thus need many function evaluations with a sizeable number of hyperparameters. This makes them inefficient for optimizing hyperparameters of deep learning algorithms, which are highly expensive to evaluate. In this work, we propose a new deterministic and efficient hyperparameter optimization method that employs radial basis functions as error surrogates. The proposed mixed integer algorithm, called HORD, searches the surrogate for the most promising hyperparameter values through dynamic coordinate search and requires many fewer function evaluations. HORD does well in low dimensions but it is exceptionally better in higher dimensions. Extensive evaluations on MNIST and CIFAR-10 for four deep neural networks demonstrate HORD significantly outperforms the well-established Bayesian optimization methods such as GP, SMAC, and TPE. For instance, on average, HORD is more than 6 times faster than GP-EI in obtaining the best configuration of 19 hyperparameters.},
  keywords = {Heuristic Search and Optimization},
  file = {/Users/ryedida/Zotero/storage/2FG7GLCM/Ilievski et al. - 2017 - Efficient hyperparameter optimization of deep learning algorithms using deterministic RBF surrogates(2).pdf}
}

@inproceedings{ilyasAdversarialExamplesAre,
  title = {Adversarial {{Examples}} Are Not {{Bugs}}, They Are {{Features}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Ilyas, Andrew and Engstrom, Logan and Santurkar, Shibani and Tran, Brandon and Tsipras, Dimitris and Madry, Aleksander},
  date = {2019},
  volume = {32},
  eprint = {1905.02175},
  eprinttype = {arxiv},
  abstract = {Adversarial examples have attracted significant attention in machine learning, but the reasons for their existence and pervasiveness remain unclear. We demonstrate that adversarial examples can be directly attributed to the presence of non-robust features: features (derived from patterns in the data distribution) that are highly predictive, yet brittle and (thus) incomprehensible to humans. After capturing these features within a theoretical framework, we establish their widespread existence in standard datasets. Finally, we present a simple setting where we can rigorously tie the phenomena we observe in practice to a misalignment between the (human-specified) notion of robustness and the inherent geometry of the data.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/4XYYSRA5/Ilyas et al. - Adversarial Examples are not Bugs, they are Featur.pdf}
}

@unpublished{Ioffe,
  title = {Batch {{Normalization}}: {{Accelerating Deep Network Training}} by {{Reducing Internal Covariate Shift}}},
  author = {Ioffe, Sergey and Szegedy, Christian},
  date = {2015},
  eprint = {1502.03167v3},
  eprinttype = {arxiv},
  file = {/Users/ryedida/Zotero/storage/XTCZXZMU/Ioffe, Szegedy - Unknown - Batch Normalization Accelerating Deep Network Training by Reducing Internal Covariate Shift(2).pdf}
}

@inproceedings{ishikawaHowEngineersPerceive2019,
  author = {Ishikawa, Fuyuki and Yoshioka, Nobukazu},
  title = {How {{Do Engineers Perceive Difficulties}} in {{Engineering}} of {{Machine-Learning Systems}}? - {{Questionnaire Survey}}},
  booktitle = {Proceedings - 2019 {{IEEE}}/{{ACM Joint}} 7th {{International Workshop}} on {{Conducting Empirical Studies}} in {{Industry}} and 6th {{International Workshop}} on {{Software Engineering Research}} and {{Industrial Practice}}, {{CESSER-IP}} 2019},
  publisher = {{Institute of Electrical and Electronics Engineers Inc.}},
  date = {2019-05-01},
  pages = {2--9},
  isbn = {978-1-72812-264-9},
  doi = {10.1109/CESSER-IP.2019.00009},
  keywords = {artificial intelligence,machine learning,questionnaire survey,software engineering},
  abstract = {There is increasing interest in machine learning (ML) techniques and their applications in recent years. Although there has been intensive support by frameworks and libraries for the implementation of ML-based systems, investigation into engineering disciplines and methods is still at the early phase. The most pressing issue in this field is identifying the essential challenges for the software engineering research community as engineering of ML-based systems requires novel approaches due to the essentially different nature of ML-based systems. In this paper, we analyze the results of a questionnaire administered to 278 people who have worked on ML-based systems in practice, clarify the essential difficulties and their causes as perceived by practitioners, and suggest potential research directions.}
}

@unpublished{izmailovAveragingWeightsLeads2019,
  author = {Izmailov, Pavel and Podoprikhin, Dmitrii and Garipov, Timur and Vetrov, Dmitry and Wilson, Andrew Gordon},
  title = {Averaging {{Weights Leads}} to {{Wider Optima}} and {{Better Generalization}}},
  date = {2019-02-25},
  eprinttype = {arxiv},
  eprint = {1803.05407},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1803.05407},
  urldate = {2021-04-12},
  langid = {english},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract = {Deep neural networks are typically trained by optimizing a loss function with an SGD variant, in conjunction with a decaying learning rate, until convergence. We show that simple averaging of multiple points along the trajectory of SGD, with a cyclical or constant learning rate, leads to better generalization than conventional training. We also show that this Stochastic Weight Averaging (SWA) procedure finds much flatter solutions than SGD, and approximates the recent Fast Geometric Ensembling (FGE) approach with a single model. Using SWA we achieve notable improvement in test accuracy over conventional SGD training on a range of state-of-the-art residual networks, PyramidNets, DenseNets, and ShakeShake networks on CIFAR-10, CIFAR-100, and ImageNet. In short, SWA is extremely easy to implement, improves generalization, and has almost no computational overhead.},
  annotation = {294 citations (Semantic Scholar/arXiv) [2021-04-12]},
  file = {/Users/ryedida/Zotero/storage/2CJKPSRD/Izmailov et al. - 2019 - Averaging Weights Leads to Wider Optima and Better.pdf}
}

@unpublished{jacotNeuralTangentKernel2020,
  author = {Jacot, Arthur and Gabriel, Franck and Hongler, Clément},
  title = {Neural {{Tangent Kernel}}: {{Convergence}} and {{Generalization}} in {{Neural Networks}}},
  shorttitle = {Neural {{Tangent Kernel}}},
  date = {2020-02-10},
  eprinttype = {arxiv},
  eprint = {1806.07572},
  eprintclass = {cs, math, stat},
  url = {http://arxiv.org/abs/1806.07572},
  urldate = {2022-01-01},
  langid = {english},
  keywords = {Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Mathematics - Probability,Statistics - Machine Learning},
  abstract = {At initialization, artificial neural networks (ANNs) are equivalent to Gaussian processes in the infinite-width limit (16; 4; 7; 13; 6), thus connecting them to kernel methods. We prove that the evolution of an ANN during training can also be described by a kernel: during gradient descent on the parameters of an ANN, the network function fθ (which maps input vectors to output vectors) follows the kernel gradient of the functional cost (which is convex, in contrast to the parameter cost) w.r.t. a new kernel: the Neural Tangent Kernel (NTK). This kernel is central to describe the generalization features of ANNs. While the NTK is random at initialization and varies during training, in the infinite-width limit it converges to an explicit limiting kernel and it stays constant during training. This makes it possible to study the training of ANNs in function space instead of parameter space. Convergence of the training can then be related to the positive-definiteness of the limiting NTK. We prove the positive-definiteness of the limiting NTK when the data is supported on the sphere and the non-linearity is non-polynomial.},
  annotation = {702 citations (Semantic Scholar/arXiv) [2022-01-01]},
  file = {/Users/ryedida/Zotero/storage/VISV48L4/Jacot et al. - 2020 - Neural Tangent Kernel Convergence and Generalizat.pdf}
}

@inproceedings{jamieson2016non,
  author = {Jamieson, Kevin and Talwalkar, Ameet},
  title = {Non-Stochastic Best Arm Identification and Hyperparameter Optimization},
  booktitle = {Artificial Intelligence and Statistics},
  publisher = {{PMLR}},
  date = {2016},
  pages = {240--248}
}

@inproceedings{jamiesonNonstochasticBestArm,
  title = {Non-Stochastic {{Best Arm Identification}} and {{Hyperparameter Optimization}}},
  booktitle = {Artificial Intelligence and Statistics},
  author = {Jamieson, Kevin and Talwalkar, Ameet},
  date = {2016},
  pages = {240--248},
  publisher = {{PMLR}},
  abstract = {Motivated by the task of hyperparameter optimization, we introduce the non-stochastic best-arm identification problem. We identify an attractive algorithm for this setting that makes no assumptions on the convergence behavior of the arms’ losses, has no free-parameters to adjust, provably outperforms the uniform allocation baseline in favorable conditions, and performs comparably (up to log factors) otherwise. Next, by leveraging the iterative nature of many learning algorithms, we cast hyperparameter optimization as an instance of non-stochastic best-arm identification. Our empirical results show that, by allocating more resources to promising hyperparameter settings, our approach achieves comparable test accuracies an order of magnitude faster than the uniform strategy. The robustness and simplicity of our approach makes it well-suited to ultimately replace the uniform strategy currently used in most machine learning software packages.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/FRW6AMQJ/Jamieson and Talwalkar - Non-stochastic Best Arm Identiﬁcation and Hyperpar.pdf}
}

@inproceedings{Jana2018,
  title = {{{DeepTest}}: {{Automated}} Testing of Deep-Neural-Network-Driven Autonomous Cars},
  booktitle = {Proceedings - {{International Conference}} on {{Software Engineering}}},
  author = {Tian, Yuchi and Pei, Kexin and Jana, Suman and Ray, Baishakhi},
  date = {2018-05},
  eprint = {1708.08559},
  eprinttype = {arxiv},
  pages = {303--314},
  issn = {0270-5257},
  doi = {10.1145/3180155.3180220},
  abstract = {Recent advances in Deep Neural Networks (DNNs) have led to the development of DNN-driven autonomous cars that, using sensors like camera, LiDAR, etc., can drive without any human intervention. Most major manufacturers including Tesla, GM, Ford, BMW, and Waymo/Google are working on building and testing different types of autonomous vehicles. The lawmakers of several US states including California, Texas, and New York have passed new legislation to fast-track the process of testing and deployment of autonomous vehicles on their roads. However, despite their spectacular progress, DNNs, just like traditional software, often demonstrate incorrect or unexpected corner-case behaviors that can lead to potentially fatal collisions. Several such real-world accidents involving autonomous cars have already happened including one which resulted in a fatality. Most existing testing techniques for DNN-driven vehicles are heavily dependent on the manual collection of test data under different driving conditions which become prohibitively expensive as the number of test conditions increases. In this paper, we design, implement, and evaluate DeepTest, a systematic testing tool for automatically detecting erroneous behaviors of DNN-driven vehicles that can potentially lead to fatal crashes. First, our tool is designed to automatically generated test cases leveraging real-world changes in driving conditions like rain, fog, lighting conditions, etc. DeepTest systematically explore different parts of the DNN logic by generating test inputs that maximize the numbers of activated neurons. DeepTest found thousands of erroneous behaviors under different realistic driving conditions (e.g., blurring, rain, fog, etc.) many of which lead to potentially fatal crashes in three top performing DNNs in the Udacity self-driving car challenge.},
  isbn = {9781450356633},
  keywords = {Autonomous vehicle,Deep learning,Deep neural networks,Neuron coverage,Self-driving cars,Testing},
  file = {/Users/ryedida/Zotero/storage/TSXGT2RY/Jana et al. - 2018 - DeepTest Automated testing of deep-neural-network-driven autonomous cars(2).pdf}
}

@inproceedings{Jannat2018,
  title = {Ubiquitous {{Emotion Recognition Using Audio}} and {{Video Data}}},
  booktitle = {Proceedings of the 2018 {{ACM}} International Joint Conference and 2018 International Symposium on Pervasive and Ubiquitous Computing and Wearable Computers, {{UbiComp}}/{{ISWC}} 2018 Adjunct, Singapore, October 08-12, 2018},
  author = {Jannat, Rahatul and Tynes, Iyonna and Lime, Lott La and Adorno, Juan and Canavan, Shaun J},
  date = {2018},
  pages = {956--959},
  doi = {10.1145/3267305.3267689},
  url = {https://doi.org/10.1145/3267305.3267689},
  isbn = {9781450359665},
  file = {/Users/ryedida/Zotero/storage/ZIN2R397/Jannat et al. - 2018 - Ubiquitous Emotion Recognition Using Audio and Video Data(2).pdf}
}

@online{jastrzebskiThreeFactorsInfluencing2018,
  author = {Jastrzębski, Stanisław and Kenton, Zachary and Arpit, Devansh and Ballas, Nicolas and Fischer, Asja and Bengio, Yoshua and Storkey, Amos},
  title = {Three {{Factors Influencing Minima}} in {{SGD}}},
  date = {2018-09-13},
  pubstate = {preprint},
  eprinttype = {arxiv},
  eprint = {1711.04623},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1711.04623},
  urldate = {2023-12-12},
  langid = {english},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract = {We investigate the dynamical and convergent properties of stochastic gradient descent (SGD) applied to Deep Neural Networks (DNNs). Characterizing the relation between learning rate, batch size and the properties of the final minima, such as width or generalization, remains an open question. In order to tackle this problem we investigate the previously proposed approximation of SGD by a stochastic differential equation (SDE). We theoretically argue that three factors - learning rate, batch size and gradient covariance - influence the minima found by SGD. In particular we find that the ratio of learning rate to batch size is a key determinant of SGD dynamics and of the width of the final minima, and that higher values of the ratio lead to wider minima and often better generalization. We confirm these findings experimentally. Further, we include experiments which show that learning rate schedules can be replaced with batch size schedules and that the ratio of learning rate to batch size is an important factor influencing the memorization process.},
  file = {/Users/ryedida/Zotero/storage/N4CPGYW3/Jastrzębski et al. - 2018 - Three Factors Influencing Minima in SGD.pdf}
}

@inproceedings{Jegou2017,
  title = {The {{One Hundred Layers Tiramisu}}: {{Fully Convolutional DenseNets}} for {{Semantic Segmentation}}},
  author = {Jegou, Simon and Drozdzal, Michal and Vazquez, David and Romero, Adriana and Bengio, Yoshua},
  date = {2017},
  booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition Workshops}} ({{CVPRW}})},
  eprint = {1611.09326},
  eprinttype = {arxiv},
  pages = {1175--1183},
  issn = {2160-7516},
  doi = {10.1109/CVPRW.2017.156},
  abstract = {State-of-the-art approaches for semantic image segmentation are built on Convolutional Neural Networks (CNNs). The typical segmentation architecture is composed of (a) a downsampling path responsible for extracting coarse semantic features, followed by (b) an upsampling path trained to recover the input image resolution at the output of the model and, optionally, (c) a post-processing module (e.g. Conditional Random Fields) to refine the model predictions. Recently, a new CNN architecture, Densely Connected Convolutional Networks (DenseNets), has shown excellent results on image classification tasks. The idea of DenseNets is based on the observation that if each layer is directly connected to every other layer in a feed-forward fashion then the network will be more accurate and easier to train. In this paper, we extend DenseNets to deal with the problem of semantic segmentation. We achieve state-of-the-art results on urban scene benchmark datasets such as CamVid and Gatech, without any further post-processing module nor pretraining. Moreover, due to smart construction of the model, our approach has much less parameters than currently published best entries for these datasets. Code to reproduce the experiments is available here : https://github.com/SimJeg/FC-DenseNet/blob/master/train.py},
  isbn = {9781538607336},
  file = {/Users/ryedida/Zotero/storage/B9HZR6HC/Jegou et al. - 2017 - The One Hundred Layers Tiramisu Fully Convolutional DenseNets for Semantic Segmentation(2).pdf}
}

@online{jiangFantasticGeneralizationMeasures2019,
  author      = {Jiang, Yiding and Neyshabur, Behnam and Mobahi, Hossein and Krishnan, Dilip and Bengio, Samy},
  title       = {Fantastic {{Generalization Measures}} and {{Where}} to {{Find Them}}},
  date        = {2019-12-04},
  pubstate    = {preprint},
  eprinttype  = {arxiv},
  eprint      = {1912.02178},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1912.02178},
  urldate     = {2023-12-12},
  langid      = {english},
  keywords    = {Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract    = {Generalization of deep networks has been of great interest in recent years, resulting in a number of theoretically and empirically motivated complexity measures. However, most papers proposing such measures study only a small set of models, leaving open the question of whether the conclusion drawn from those experiments would remain valid in other settings. We present the first large scale study of generalization in deep networks. We investigate more then 40 complexity measures taken from both theoretical bounds and empirical studies. We train over 10,000 convolutional networks by systematically varying commonly used hyperparameters. Hoping to uncover potentially causal relationships between each measure and generalization, we analyze carefully controlled experiments and show surprising failures of some measures as well as promising measures for further research.},
  file        = {/Users/ryedida/Zotero/storage/H9JZHDDX/Jiang et al. - 2019 - Fantastic Generalization Measures and Where to Fin.pdf}
}

@article{jimmyDeepNetsReally,
  title = {Do {{Deep Nets Really Need}} to Be {{Deep}}?},
  author = {Ba, Lei Jimmy and Caruana, Rich},
  date = {2014},
  journaltitle = {Advances in Neural Information Processing Systems},
  volume = {27},
  eprint = {1312.6184},
  eprinttype = {arxiv},
  abstract = {Currently, deep neural networks are the state of the art on problems such as speech recognition and computer vision. In this paper we empirically demonstrate that shallow feed-forward nets can learn the complex functions previously learned by deep nets and achieve accuracies previously only achievable with deep models. Moreover, in some cases the shallow nets can learn these deep functions using the same number of parameters as the original deep models. On the TIMIT phoneme recognition and CIFAR-10 image recognition tasks, shallow nets can be trained that perform similarly to complex, well-engineered, deeper convolutional models.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/Z3GZS7UJ/Jimmy and Caruana - Do Deep Nets Really Need to be Deep.pdf}
}

@unpublished{Jin2018,
  author     = {Jin, Haifeng and Song, Qingquan and Hu, Xia},
  title      = {Auto-{{Keras}}: {{An Efficient Neural Architecture Search System}}},
  date       = {2018},
  eprinttype = {arxiv},
  eprint     = {1806.10282},
  url        = {http://arxiv.org/abs/1806.10282},
  keywords   = {automated machine learning,automl,bayesian optimization,network morphism,neural architecture search},
  abstract   = {Neural architecture search (NAS) has been proposed to automatically tune deep neural networks, but existing search algorithms, e.g., NASNet, PNAS, usually suffer from expensive computational cost. Network morphism, which keeps the functionality of a neural network while changing its neural architecture, could be helpful for NAS by enabling more efficient training during the search. In this paper, we propose a novel framework enabling Bayesian optimization to guide the network morphism for efficient neural architecture search. The framework develops a neural network kernel and a tree-structured acquisition function optimization algorithm to efficiently explores the search space. Intensive experiments on real-world benchmark datasets have been done to demonstrate the superior performance of the developed framework over the state-of-the-art methods. Moreover, we build an open-source AutoML system based on our method, namely Auto-Keras. The system runs in parallel on CPU and GPU, with an adaptive search strategy for different GPU memory limits.},
  file       = {/Users/ryedida/Zotero/storage/QPR7DHFD/Jin, Song, Hu - 2018 - Auto-Keras An Efficient Neural Architecture Search System(2).pdf}
}

@book{johnduchiLectureNotesStatistics2023,
  title = {Lecture Notes on Statistics and Information Theory},
  author = {Duchi, John},
  date = {2023},
  url = {https://web.stanford.edu/class/stats311/lecture-notes.pdf},
  file = {/Users/ryedida/Zotero/storage/P7KJQSIW/lecture-notes.pdf}
}

@article{johnsonAcceleratingStochasticGradient,
  title = {Accelerating {{Stochastic Gradient Descent}} Using {{Predictive Variance Reduction}}},
  author = {Johnson, Rie and Zhang, Tong},
  date = {2013},
  journaltitle = {Advances in Neural Information Processing Systems},
  volume = {26},
  abstract = {Stochastic gradient descent is popular for large scale optimization but has slow convergence asymptotically due to the inherent variance. To remedy this problem, we introduce an explicit variance reduction method for stochastic gradient descent which we call stochastic variance reduced gradient (SVRG). For smooth and strongly convex functions, we prove that this method enjoys the same fast convergence rate as those of stochastic dual coordinate ascent (SDCA) and Stochastic Average Gradient (SAG). However, our analysis is significantly simpler and more intuitive. Moreover, unlike SDCA or SAG, our method does not require the storage of gradients, and thus is more easily applicable to complex problems such as some structured prediction problems and neural network learning.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/IUPGVIV9/Johnson and Zhang - Accelerating Stochastic Gradient Descent using Pre.pdf}
}

@article{jones1998efficient,
  title = {Efficient Global Optimization of Expensive Black-Box Functions},
  author = {Jones, Donald R and Schonlau, Matthias and Welch, William J},
  date = {1998},
  journaltitle = {Journal of Global Optimization},
  volume = {13},
  number = {4},
  pages = {455--492},
  publisher = {{Springer}},
  doi = {10.1023/A:1008306431147}
}

@inproceedings{Joseph2017,
  title = {Girls Rule, Boys Drool: {{Extracting}} Semantic and Affective Stereotypes on {{Twitter}}},
  author = {Joseph, Kenneth and Wei, Wei and Carley, Kathleen M.},
  date = {2017},
  booktitle = {Proceedings of the 2017 {{ACM Conference}} on {{Computer Supported Cooperative Work}} and {{Social Computing}}},
  pages = {1362--1374},
  publisher = {{ACM}},
  doi = {10.1145/2998181.2998187},
  abstract = {Social identities carry widely agreed upon meanings, called stereotypes, that have important effects on social processes. We develop a method to extract the stereotypes of a particular population of Twitter users. Our model is grounded in social theory on stereotypes as both identities' affective meanings and their semantic relationships to each other. We apply our model to a dataset of 45K Twitter users who actively tweeted about the Michael Brown and Eric Garner tragedies. This case study furthers our understanding of both the stereotypes present for those who actively discussed these tragedies online as well as the structure of stereotypes in wider populations both online and off.},
  isbn = {9781450343350},
  keywords = {computational social science,identity,social psychology,stereotypes},
  file = {/Users/ryedida/Zotero/storage/KPLZTDJC/Joseph, Wei, Carley - 2017 - Girls rule, boys drool Extracting semantic and affective stereotypes on Twitter(2).pdf}
}

@article{kaddour2022flat,
  author       = {Kaddour, Jean and Liu, Linqing and Silva, Ricardo and Kusner, Matt J},
  title        = {When Do Flat Minima Optimizers Work?},
  journaltitle = {Advances in Neural Information Processing Systems},
  date         = {2022},
  volume       = {35},
  pages        = {16577--16595}
}

@article{kakadeDualityStrongConvexity,
  author   = {Kakade, Sham M and Shalev-Shwartz, Shai and Tewari, Ambuj},
  title    = {On the Duality of Strong Convexity and Strong Smoothness: {{Learning}} Applications and Matrix Regularization},
  langid   = {english},
  abstract = {We show that a function is strongly convex with respect to some norm if and only if its conjugate function is strongly smooth with respect to the dual norm. This result has already been found to be a key component in deriving and analyzing several learning algorithms. Utilizing this duality, we isolate a single inequality which seamlessly implies both generalization bounds and online regret bounds; and we show how to construct strongly convex functions over matrices based on strongly convex functions over vectors. The newly constructed functions (over matrices) inherit the strong convexity properties of the underlying vector functions. We demonstrate the potential of this framework by analyzing several learning algorithms including group Lasso, kernel learning, and online control with adversarial quadratic costs.},
  file     = {/Users/ryedida/Zotero/storage/8IP49K2N/Kakade et al. - On the duality of strong convexity and strong smoo.pdf}
}

@unpublished{kang2022detecting,
  author     = {Kang, Hong Jin and Aw, Khai Loong and Lo, David},
  title      = {Detecting False Alarms from Automatic Static Analysis Tools: {{How}} Far Are We?},
  date       = {2022},
  eprinttype = {arxiv},
  eprint     = {2202.05982}
}

@unpublished{Karim2019,
  title = {Drug-{{Drug Interaction Prediction Based}} on {{Knowledge Graph Embeddings}} and {{Convolutional-LSTM Network}}},
  author = {Karim, Md. Rezaul and Cochez, Michael and Jares, Joao Bosco and Uddin, Mamtaz and Beyan, Oya and Decker, Stefan},
  date = {2019},
  eprint = {1908.01288},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1908.01288},
  abstract = {Interference between pharmacological substances can cause serious medical injuries. Correctly predicting so-called drug-drug interactions (DDI) does not only reduce these cases but can also result in a reduction of drug development cost. Presently, most drug-related knowledge is the result of clinical evaluations and post-marketing surveillance; resulting in a limited amount of information. Existing data-driven prediction approaches for DDIs typically rely on a single source of information, while using information from multiple sources would help improve predictions. Machine learning (ML) techniques are used, but the techniques are often unable to deal with skewness in the data. Hence, we propose a new ML approach for predicting DDIs based on multiple data sources. For this task, we use 12,000 drug features from DrugBank, PharmGKB, and KEGG drugs, which are integrated using Knowledge Graphs (KGs). To train our prediction model, we first embed the nodes in the graph using various embedding approaches. We found that the best performing combination was a ComplEx embedding method creating using PyTorch-BigGraph (PBG) with a Convolutional-LSTM network and classic machine learning-based prediction models. The model averaging ensemble method of three best classifiers yields up to 0.94, 0.92, 0.80 for AUPR, F1-score, and MCC, respectively during 5-fold cross-validation tests.},
  isbn = {9781450366663},
  keywords = {conv-lstm network,drug-drug interactions,embeddings,graph,knowledge graphs,linked data,model averaging ensemble}
}

@article{Katz2017,
  title = {Towards Proving the Adversarial Robustness of Deep Neural Networks},
  author = {Katz, Guy and Barrett, Clark and Dill, David L. and Julian, Kyle and Kochenderfer, Mykel J.},
  date = {2017},
  journaltitle = {Electronic Proceedings in Theoretical Computer Science},
  volume = {257},
  pages = {19--26},
  issn = {2075-2180},
  doi = {10.4204/EPTCS.257.3},
  abstract = {Autonomous vehicles are highly complex systems, required to function reliably in a wide variety of situations. Manually crafting software controllers for these vehicles is difficult, but there has been some success in using deep neural networks generated using machine-learning. However, deep neural networks are opaque to human engineers, rendering their correctness very difficult to prove manually; and existing automated techniques, which were not designed to operate on neural networks, fail to scale to large systems. This paper focuses on proving the adversarial robustness of deep neural networks, i.e. proving that small perturbations to a correctly-classified input to the network cannot cause it to be misclassified. We describe some of our recent and ongoing work on verifying the adversarial robustness of networks, and discuss some of the open questions we have encountered and how they might be addressed.},
  file = {/Users/ryedida/Zotero/storage/HBD4JF6B/Katz et al. - 2017 - Towards proving the adversarial robustness of deep neural networks(2).pdf}
}

@inproceedings{Katz2017a,
  title = {Reluplex: {{An}} Efficient {{SMT}} Solver for Verifying Deep Neural Networks},
  author = {Katz, Guy and Barrett, Clark and Dill, David L. and Julian, Kyle and Kochenderfer, Mykel J.},
  date = {2017},
  booktitle = {Computer Aided Verification},
  series = {Lecture Notes in Computer Science},
  volume = {10426},
  eprint = {1702.01135},
  eprinttype = {arxiv},
  pages = {97--117},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-63387-9_5},
  abstract = {Deep neural networks have emerged as a widely used and effective means for tackling complex, real-world problems. However, a major obstacle in applying them to safety-critical systems is the great difficulty in providing formal guarantees about their behavior. We present a novel, scalable, and efficient technique for verifying properties of deep neural networks (or providing counter-examples). The technique is based on the simplex method, extended to handle the non-convex Rectified Linear Unit (ReLU) activation function, which is a crucial ingredient in many modern neural networks. The verification procedure tackles neural networks as a whole, without making any simplifying assumptions. We evaluated our technique on a prototype deep neural network implementation of the next-generation airborne collision avoidance system for unmanned aircraft (ACAS Xu). Results show that our technique can successfully prove properties of networks that are an order of magnitude larger than the largest networks verified using existing methods.},
  isbn = {9783319633862}
}

@inproceedings{Katz2019,
  title = {The {{Marabou Framework}} for {{Verification}} and {{Analysis}} of {{Deep Neural Networks}}},
  author = {Katz, Guy and Huang, Derek A. and Ibeling, Duligur and Julian, Kyle and Lazarus, Christopher and Lim, Rachel and Shah, Parth and Thakoor, Shantanu and Wu, Haoze and Zeljić, Aleksandar and Dill, David L. and Kochenderfer, Mykel J. and Barrett, Clark},
  date = {2019},
  booktitle = {Computer Aided Verification},
  series = {Lecture Notes in Computer Science},
  volume = {11561},
  pages = {443--452},
  issn = {1611-3349},
  doi = {10.1007/978-3-030-25540-4_26},
  abstract = {Deep neural networks are revolutionizing the way complex systems are designed. Consequently, there is a pressing need for tools and techniques for network analysis and certification. To help in addressing that need, we present Marabou, a framework for verifying deep neural networks. Marabou is an SMT-based tool that can answer queries about a network’s properties by transforming these queries into constraint satisfaction problems. It can accommodate networks with different activation functions and topologies, and it performs high-level reasoning on the network that can curtail the search space and improve performance. It also supports parallel execution to further enhance scalability. Marabou accepts multiple input formats, including protocol buffer files generated by the popular TensorFlow framework for neural networks. We describe the system architecture and main components, evaluate the technique and discuss ongoing work.},
  isbn = {9783030255398},
  file = {/Users/ryedida/Zotero/storage/SSY6D89C/Katz et al. - 2019 - The Marabou Framework for Verification and Analysis of Deep Neural Networks(2).pdf}
}

@inproceedings{Kazak2019,
  title = {Verifying {{Deep-RL-Driven Systems}}},
  author = {Kazak, Yafim and Barrett, Clark and Katz, Guy and Schapira, Michael},
  date = {2019},
  booktitle = {Proceedings of the 2019 {{Workshop}} on {{Network Meets AI}} \& {{ML}}},
  publisher = {{ACM}},
  isbn = {9781450368728},
  file = {/Users/ryedida/Zotero/storage/BVALNI6J/Kazak et al. - 2019 - Verifying Deep-RL-Driven Systems(2).pdf}
}

@online{keskarLargeBatchTrainingDeep2017,
  author      = {Keskar, Nitish Shirish and Mudigere, Dheevatsa and Nocedal, Jorge and Smelyanskiy, Mikhail and Tang, Ping Tak Peter},
  title       = {On {{Large-Batch Training}} for {{Deep Learning}}: {{Generalization Gap}} and {{Sharp Minima}}},
  shorttitle  = {On {{Large-Batch Training}} for {{Deep Learning}}},
  date        = {2017-02-09},
  pubstate    = {preprint},
  eprinttype  = {arxiv},
  eprint      = {1609.04836},
  eprintclass = {cs, math},
  url         = {http://arxiv.org/abs/1609.04836},
  urldate     = {2023-11-25},
  langid      = {english},
  keywords    = {Computer Science - Machine Learning,Mathematics - Optimization and Control},
  abstract    = {The stochastic gradient descent (SGD) method and its variants are algorithms of choice for many Deep Learning tasks. These methods operate in a small-batch regime wherein a fraction of the training data, say 32–512 data points, is sampled to compute an approximation to the gradient. It has been observed in practice that when using a larger batch there is a degradation in the quality of the model, as measured by its ability to generalize. We investigate the cause for this generalization drop in the large-batch regime and present numerical evidence that supports the view that large-batch methods tend to converge to sharp minimizers of the training and testing functions—and as is well known, sharp minima lead to poorer generalization. In contrast, small-batch methods consistently converge to flat minimizers, and our experiments support a commonly held view that this is due to the inherent noise in the gradient estimation. We discuss several strategies to attempt to help large-batch methods eliminate this generalization gap.},
  file        = {/Users/ryedida/Zotero/storage/MN7TRNKL/Keskar et al. - 2017 - On Large-Batch Training for Deep Learning General.pdf}
}

@article{Khajah2014,
  title = {Maximizing {{Students}}' {{Retention}} via {{Spaced Review}}: {{Practical Guidance From Computational Models}} of {{Memory}}},
  author = {Khajah, Mohammad M and Lindsey, Robert V and Mozer, Michael C},
  date = {2014},
  journaltitle = {Topics in Cognitive Science},
  volume = {6},
  pages = {157--169},
  issn = {1756-8765},
  doi = {10.1111/tops.12077},
  abstract = {During each school semester, students face an onslaught of material to be learned. Students work hard to achieve initial mastery of the material, but when they move on, the newly learned facts, concepts, and skills degrade in memory. Although both students and educators appreciate that review can help stabilize learning, time constraints result in a trade-off between acquiring new knowledge and preserving old knowledge. To use time efficiently, when should review take place? Experimental studies have shown benefits to long-term retention with spaced study, but little practical advice is available to students and educators about the optimal spacing of study. The dearth of advice is due to the challenge of conducting experimental studies of learning in educational settings, especially where material is introduced in blocks over the time frame of a semester. In this study, we turn to two established models of memory—ACT-R and MCM—to conduct simulation studies exploring the impact of study schedule on long-term retention. Based on the premise of a fixed time each week to review, converging evidence from the two models suggests that an optimal review schedule obtains significant benefits over haphazard (suboptimal) review schedules. Furthermore, we identify two scheduling heuristics that obtain near optimal review performance: (a) review the material from l-weeks back, and (b) review material whose predicted memory strength is closest to a particular threshold. The former has implications for classroom instruction and the latter for the design of digital tutors.}
}

@inproceedings{Khajah2018,
  title = {Boosting Engagement with Educational Software Using near Wins},
  author = {Khajah, Mohammad M. and Mozer, Michael C. and Kelly, Sean and Milne, Brent},
  date = {2018},
  booktitle = {Artificial Intelligence in Education},
  series = {Lecture Notes in Computer Science},
  volume = {10948},
  pages = {171--175},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-93846-2_31},
  isbn = {9783319938455},
  keywords = {Anticipation,Educational applications,Near-win},
  file = {/Users/ryedida/Zotero/storage/MLFMAPRW/Khajah et al. - 2018 - Boosting engagement with educational software using near wins(2).pdf}
}

@inproceedings{kim2016emerging,
  author    = {Kim, Miryung and Zimmermann, Thomas and DeLine, Robert and Begel, Andrew},
  title     = {The Emerging Role of Data Scientists on Software Development Teams},
  booktitle = {Proceedings of the 38th {{International Conference}} on {{Software Engineering}}},
  date      = {2016},
  pages     = {96--107},
  publisher = {{ACM}}
}

@inproceedings{Kim2019,
  title = {Guiding {{Deep Learning System Testing Using Surprise Adequacy}}},
  author = {Kim, Jinhan and Feldt, Robert and Yoo, Shin},
  date = {2019},
  booktitle = {Proceedings of the 41st {{International Conference}} on {{Software Engineering}}},
  eprint = {1808.08444},
  eprinttype = {arxiv},
  pages = {1039--1049},
  publisher = {{IEEE}},
  issn = {0270-5257},
  doi = {10.1109/ICSE.2019.00108},
  abstract = {Deep Learning (DL) systems are rapidly being adopted in safety and security critical domains, urgently calling for ways to test their correctness and robustness. Testing of DL systems has traditionally relied on manual collection and labelling of data. Recently, a number of coverage criteria based on neuron activation values have been proposed. These criteria essentially count the number of neurons whose activation during the execution of a DL system satisfied certain properties, such as being above predefined thresholds. However, existing coverage criteria are not sufficiently fine grained to capture subtle behaviours exhibited by DL systems. Moreover, evaluations have focused on showing correlation between adversarial examples and proposed criteria rather than evaluating and guiding their use for actual testing of DL systems. We propose a novel test adequacy criterion for testing of DL systems, called Surprise Adequacy for Deep Learning Systems (SADL), which is based on the behaviour of DL systems with respect to their training data. We measure the surprise of an input as the difference in DL system's behaviour between the input and the training data (i.e., what was learnt during training), and subsequently develop this as an adequacy criterion: a good test input should be sufficiently but not overtly surprising compared to training data. Empirical evaluation using a range of DL systems from simple image classifiers to autonomous driving car platforms shows that systematic sampling of inputs based on their surprise can improve classification accuracy of DL systems against adversarial examples by up to 77.5\% via retraining.},
  isbn = {9781728108698},
  keywords = {Coverage Criteria,Deep Learning Systems,Test Adequacy},
  file = {/Users/ryedida/Zotero/storage/3QC53P68/Kim, Feldt, Yoo - 2019 - Guiding Deep Learning System Testing Using Surprise Adequacy(2).pdf}
}

@unpublished{kim2023repairing,
  author     = {Kim, Jinhan and Humbatova, Nargiz and Jahangirova, Gunel and Tonella, Paolo and Yoo, Shin},
  title      = {Repairing {{DNN}} Architecture: {{Are}} We There Yet?},
  date       = {2023},
  eprinttype = {arxiv},
  eprint     = {2301.11568}
}

@online{kingmaAdamMethodStochastic2017,
  author      = {Kingma, Diederik P. and Ba, Jimmy},
  title       = {Adam: {{A Method}} for {{Stochastic Optimization}}},
  shorttitle  = {Adam},
  date        = {2017-01-29},
  pubstate    = {preprint},
  eprinttype  = {arxiv},
  eprint      = {1412.6980},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/1412.6980},
  urldate     = {2023-12-09},
  langid      = {english},
  keywords    = {Computer Science - Machine Learning},
  abstract    = {We introduce Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments. The method is straightforward to implement, is computationally efficient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters. The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. The hyper-parameters have intuitive interpretations and typically require little tuning. Some connections to related algorithms, on which Adam was inspired, are discussed. We also analyze the theoretical convergence properties of the algorithm and provide a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework. Empirical results demonstrate that Adam works well in practice and compares favorably to other stochastic optimization methods. Finally, we discuss AdaMax, a variant of Adam based on the infinity norm.},
  file        = {/Users/ryedida/Zotero/storage/DBINP7CN/Kingma and Ba - 2017 - Adam A Method for Stochastic Optimization.pdf}
}

@unpublished{Klambauer2017,
  title = {Self-{{Normalizing Neural Networks}}},
  author = {Klambauer, Günter and Unterthiner, Thomas and Mayr, Andreas and Hochreiter, Sepp},
  date = {2017},
  eprint = {1706.02515},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1706.02515},
  abstract = {Deep Learning has revolutionized vision via convolutional neural networks (CNNs) and natural language processing via recurrent neural networks (RNNs). However, success stories of Deep Learning with standard feed-forward neural networks (FNNs) are rare. FNNs that perform well are typically shallow and, therefore cannot exploit many levels of abstract representations. We introduce self-normalizing neural networks (SNNs) to enable high-level abstract representations. While batch normalization requires explicit normalization, neuron activations of SNNs automatically converge towards zero mean and unit variance. The activation function of SNNs are "scaled exponential linear units" (SELUs), which induce self-normalizing properties. Using the Banach fixed-point theorem, we prove that activations close to zero mean and unit variance that are propagated through many network layers will converge towards zero mean and unit variance -- even under the presence of noise and perturbations. This convergence property of SNNs allows to (1) train deep networks with many layers, (2) employ strong regularization, and (3) to make learning highly robust. Furthermore, for activations not close to unit variance, we prove an upper and lower bound on the variance, thus, vanishing and exploding gradients are impossible. We compared SNNs on (a) 121 tasks from the UCI machine learning repository, on (b) drug discovery benchmarks, and on (c) astronomy tasks with standard FNNs and other machine learning methods such as random forests and support vector machines. SNNs significantly outperformed all competing FNN methods at 121 UCI tasks, outperformed all competing methods at the Tox21 dataset, and set a new record at an astronomy data set. The winning SNN architectures are often very deep. Implementations are available at: github.com/bioinf-jku/SNNs.},
  file = {/Users/ryedida/Zotero/storage/4HP7SL9Y/Klambauer et al. - 2017 - Self-Normalizing Neural Networks(2).pdf}
}

@inproceedings{Klein2017,
  title = {Fast {{Bayesian}} Optimization of Machine Learning Hyperparameters on Large Datasets},
  author = {Klein, Aaron and Falkner, Stefan and Bartels, Simon and Hennig, Philipp and Hutter, Frank},
  date = {2017},
  booktitle = {Proceedings of the 20th {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}}},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {54},
  eprint = {1605.07079},
  eprinttype = {arxiv},
  abstract = {Bayesian optimization has become a successful tool for hyperparameter optimization of machine learning algorithms, such as support vector machines or deep neural networks. Despite its success, for large datasets, training and validating a single configuration often takes hours, days, or even weeks, which limits the achievable performance. To accelerate hyperparameter optimization, we propose a generative model for the validation error as a function of training set size, which is learned during the optimization process and allows exploration of preliminary configurations on small subsets, by extrapolating to the full dataset. We construct a Bayesian optimization procedure, dubbed Fabolas, which models loss and training time as a function of dataset size and automatically trades off high information gain about the global optimum against computational cost. Experiments optimizing support vector machines and deep neural networks show that Fabolas often finds high-quality solutions 10 to 100 times faster than other state-of-the-art Bayesian optimization methods or the recently proposed bandit strategy Hyperband.},
  file = {/Users/ryedida/Zotero/storage/FQC9WDYA/Klein et al. - 2017 - Fast Bayesian optimization of machine learning hyperparameters on large datasets(2).pdf}
}

@article{kobakInitializationCriticalPreserving2021,
  author       = {Kobak, Dmitry and Linderman, George C.},
  title        = {Initialization Is Critical for Preserving Global Data Structure in Both T-{{SNE}} and {{UMAP}}},
  journaltitle = {Nature Biotechnology},
  shortjournal = {Nat Biotechnol},
  date         = {2021-02},
  volume       = {39},
  number       = {2},
  pages        = {156--157},
  issn         = {1087-0156, 1546-1696},
  doi          = {10.1038/s41587-020-00809-z},
  url          = {http://www.nature.com/articles/s41587-020-00809-z},
  urldate      = {2021-04-11},
  langid       = {english},
  annotation   = {2 citations (Semantic Scholar/DOI) [2021-04-11]},
  file         = {/Users/ryedida/Zotero/storage/X3P8E5UQ/Kobak and Linderman - 2021 - Initialization is critical for preserving global d.pdf}
}

% Kobak & Linderman (2019), preprint (DOI prefix 10.1101, biorxiv.org URL).
% `institution` names the preprint server; the exported value "Bioinformatics" looked like a
% subject category rather than an issuing body. NOTE(review): confirm.
% "t-SNE" is brace-protected as a whole so no style can capitalise the leading "t".
@report{kobakUMAPDoesNot2019,
  type = {preprint},
  title = {{{UMAP}} Does Not Preserve Global Structure Any Better than {{t-SNE}} When Using the Same Initialization},
  author = {Kobak, Dmitry and Linderman, George C.},
  date = {2019-12-19},
  institution = {{bioRxiv}},
  doi = {10.1101/2019.12.19.877522},
  url = {http://biorxiv.org/lookup/doi/10.1101/2019.12.19.877522},
  urldate = {2021-04-11},
  abstract = {One of the most ubiquitous analysis tools employed in single-cell transcriptomics and cytometry is t-distributed stochastic neighbor embedding (t-SNE) [1], used to visualize individual cells as points on a 2D scatter plot such that similar cells are positioned close together. Recently, a related algorithm, called uniform manifold approximation and projection (UMAP) [2] has attracted substantial attention in the single-cell community. In Nature Biotechnology, Becht et al. [3] argued that UMAP is preferable to t-SNE because it better preserves the global structure of the data and is more consistent across runs. Here we show that this alleged superiority of UMAP can be entirely attributed to different choices of initialization in the implementations used by Becht et al.: t-SNE implementations by default used random initialization, while the UMAP implementation used a technique called Laplacian eigenmaps [4] to initialize the embedding. We show that UMAP with random initialization preserves global structure as poorly as t-SNE with random initialization, while t-SNE with informative initialization performs as well as UMAP with informative initialization. Hence, contrary to the claims of Becht et al., their experiments do not demonstrate any advantage of the UMAP algorithm per se, but rather warn against using random initialization.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/S7ALUXR6/Kobak and Linderman - 2019 - UMAP does not preserve global structure any better.pdf}
}

% Koenig & Hofer (2011), Journal of Vision 11(1), article 21.
% Journal names are not recased by styles, so the proper-noun capitalisation is stored here.
@article{koenig2011absolute,
  title = {The Absolute Threshold of Cone Vision},
  author = {Koenig, Darren and Hofer, Heidi},
  date = {2011},
  journaltitle = {Journal of Vision},
  volume = {11},
  number = {1},
  pages = {21},
  publisher = {{The Association for Research in Vision and Ophthalmology}}
}

% IJCAI 2016 conference paper (pp. 4190--4194), hence @inproceedings with the venue in `booktitle`.
% The exported DOI (10.1109/ICCV.2015.172) identifies the ICCV 2015 version of this work, not the
% IJCAI paper, so it is kept in `note` instead of `doi`; the IEEE ISBN exported with it is dropped.
% The last author's surname is the compound "Rota Bulò".
@inproceedings{Kontschieder2016,
  title = {Deep Neural Decision Forests},
  author = {Kontschieder, Peter and Fiterau, Madalina and Criminisi, Antonio and Rota Bulò, Samuel},
  date = {2016},
  booktitle = {{{IJCAI}} International Joint Conference on Artificial Intelligence},
  pages = {4190--4194},
  issn = {10450823},
  note = {Earlier version published at ICCV 2015, doi:10.1109/ICCV.2015.172},
  abstract = {We present a novel approach to enrich classification trees with the representation learning ability of deep (neural) networks within an end-to-end trainable architecture. We combine these two worlds via a stochastic and differentiable decision tree model, which steers the formation of latent representations within the hidden layers of a deep network. The proposed model differs from conventional deep networks in that a decision forest provides the final predictions and it differs from conventional decision forests by introducing a principled, joint and global optimization of split and leaf node parameters. Our approach compares favourably to other state-of-the-art deep models on a large-scale image classification task like ImageNet.},
  file = {/Users/ryedida/Zotero/storage/VGUIBX3I/Kontschieder et al. - 2016 - Deep neural decision forests(2).pdf}
}

% Krystal et al. (2019), Neuron 101(5), pp. 774--778.
@article{krystalKetamineParadigmShift2019,
  author = {Krystal, John H. and Abdallah, Chadi G. and Sanacora, Gerard and Charney, Dennis S. and Duman, Ronald S.},
  title = {Ketamine: {{A Paradigm Shift}} for {{Depression Research}} and {{Treatment}}},
  shorttitle = {Ketamine},
  journaltitle = {Neuron},
  shortjournal = {Neuron},
  date = {2019-03},
  volume = {101},
  number = {5},
  pages = {774--778},
  issn = {08966273},
  doi = {10.1016/j.neuron.2019.02.005},
  url = {https://linkinghub.elsevier.com/retrieve/pii/S089662731930114X},
  urldate = {2023-09-21},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/VUVBSTP9/Krystal et al. - 2019 - Ketamine A Paradigm Shift for Depression Research.pdf}
}

% Kumaraswamy (1980), Journal of Hydrology 46(1): the double-bounded probability density function.
@article{kumaraswamyGeneralizedProbabilityDensity1980,
  author = {Kumaraswamy, P.},
  title = {A Generalized Probability Density Function for Double-Bounded Random Processes},
  journaltitle = {Journal of Hydrology},
  shortjournal = {Journal of Hydrology},
  date = {1980-03-01},
  volume = {46},
  number = {1},
  pages = {79--88},
  issn = {0022-1694},
  doi = {10.1016/0022-1694(80)90036-0},
  url = {https://www.sciencedirect.com/science/article/pii/0022169480900360},
  urldate = {2024-01-27},
  abstract = {The author developed in 1976 the sinepower probability density function (SP-PDF) to fit up random processes which are bounded at the lower and upper ends, and which has a mode occurring between these two bounds. This latter condition is now relaxed and a generalized PDF entitled double bounded probability density function (DB-PDF) is developed here. Methods for the application to practical problems of parameter estimation and to computer simulation of random variables are explained by a numerical example.},
  file = {/Users/ryedida/Zotero/storage/2I4YXP6Q/Kumaraswamy_1980_A generalized probability density function for double-bounded random processes.pdf;/Users/ryedida/Zotero/storage/IU5WLPUA/0022169480900360.html}
}

% Kumar, Batra & Raman (2021), Soft Computing; no volume/pages recorded in this entry.
@article{kumarDeepNeuralNetwork2021,
  author = {Kumar, Puneet and Batra, Shalini and Raman, Balasubramanian},
  title = {Deep Neural Network Hyper-Parameter Tuning through Twofold Genetic Approach},
  journaltitle = {Soft Computing},
  shortjournal = {Soft Comput},
  date = {2021-04-18},
  issn = {1432-7643, 1433-7479},
  doi = {10.1007/s00500-021-05770-w},
  url = {https://link.springer.com/10.1007/s00500-021-05770-w},
  urldate = {2021-04-22},
  langid = {english},
  abstract = {In this paper, traditional and meta-heuristic approaches for optimizing deep neural networks (DNN) have been surveyed, and a genetic algorithm (GA)-based approach involving two optimization phases for hyper-parameter discovery and optimal data subset determination has been proposed. The first phase aims to quickly select an optimal combination of the network hyper-parameters to design a DNN. Compared to the traditional grid-search-based method, the optimal parameters have been computed 6.5 times faster for recurrent neural network (RNN) and 8 times faster for convolutional neural network (CNN). The proposed approach is capable of tuning multiple hyper-parameters simultaneously. The second phase finds an appropriate subset of the training data for near-optimal prediction performance, providing an additional speedup of 75.86\% for RNN and 41.12\% for CNN over the first phase.},
  file = {/Users/ryedida/Zotero/storage/XQCLNYY4/Kumar et al. - 2021 - Deep neural network hyper-parameter tuning through.pdf}
}

% ICDM 2013 conference paper, hence @inproceedings with the proceedings title in `booktitle`.
@inproceedings{Kunapuli2013,
  title = {Guiding Autonomous Agents to Better Behaviors through Human Advice},
  author = {Kunapuli, Gautam and Odom, Phillip and Shavlik, Jude W. and Natarajan, Sriraam},
  date = {2013},
  booktitle = {Proceedings - IEEE International Conference on Data Mining, ICDM},
  pages = {409--418},
  issn = {15504786},
  doi = {10.1109/ICDM.2013.79},
  abstract = {Inverse Reinforcement Learning (IRL) is an approach for domain-reward discovery from demonstration, where an agent mines the reward function of a Markov decision process by observing an expert acting in the domain. In the standard setting, it is assumed that the expert acts (nearly) optimally, and a large number of trajectories, i.e., training examples are available for reward discovery (and consequently, learning domain behavior). These are not practical assumptions: trajectories are often noisy, and there can be a paucity of examples. Our novel approach incorporates advice-giving into the IRL framework to address these issues. Inspired by preference elicitation, a domain expert provides advice on states and actions (features) by stating preferences over them. We evaluate our approach on several domains and show that with small amounts of targeted preference advice, learning is possible from noisy demonstrations, and requires far fewer trajectories compared to simply learning from trajectories alone.},
  isbn = {978-0-7695-5108-1}
}

% Kunstner, Balles & Hennig: the exported record had no date and no venue, so it could not be
% rendered as a complete reference.
% NOTE(review): venue (NeurIPS 2019, vol. 32) and arXiv id were added from the published record;
% confirm against the proceedings page before relying on them.
@inproceedings{kunstnerLimitationsEmpiricalFisher,
  title = {Limitations of the {{Empirical Fisher Approximation}} for {{Natural Gradient Descent}}},
  author = {Kunstner, Frederik and Balles, Lukas and Hennig, Philipp},
  date = {2019},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  volume = {32},
  eprint = {1905.12558},
  eprinttype = {arxiv},
  abstract = {Natural gradient descent, which preconditions a gradient descent update with the Fisher information matrix of the underlying statistical model, is a way to capture partial second-order information. Several highly visible works have advocated an approximation known as the empirical Fisher, drawing connections between approximate second-order methods and heuristics like Adam. We dispute this argument by showing that the empirical Fisher—unlike the Fisher—does not generally capture second-order information. We further argue that the conditions under which the empirical Fisher approaches the Fisher (and the Hessian) are unlikely to be met in practice, and that, even on simple optimization problems, the pathologies of the empirical Fisher can have undesirable effects.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/VPLTIS9T/Kunstner et al. - Limitations of the Empirical Fisher Approximation .pdf}
}

% ICLR 2017 workshop-track paper (arXiv:1607.02533), hence @inproceedings.
% `date` follows the venue year stated in the proceedings title; the citation key keeps its
% historical "2019" so existing \cite commands continue to work.
@inproceedings{Kurakin2019,
  title = {Adversarial Examples in the Physical World},
  author = {Kurakin, Alexey and Goodfellow, Ian J. and Bengio, Samy},
  date = {2017},
  booktitle = {5th International Conference on Learning Representations, ICLR 2017 - Workshop Track Proceedings},
  eprint = {1607.02533},
  eprinttype = {arxiv},
  pages = {1--14},
  abstract = {Most existing machine learning classifiers are highly vulnerable to adversarial examples. An adversarial example is a sample of input data which has been modified very slightly in a way that is intended to cause a machine learning classifier to misclassify it. In many cases, these modifications can be so subtle that a human observer does not even notice the modification at all, yet the classifier still makes a mistake. Adversarial examples pose security concerns because they could be used to perform an attack on machine learning systems, even if the adversary has no access to the underlying model. Up to now, all previous work has assumed a threat model in which the adversary can feed data directly into the machine learning classifier. This is not always the case for systems operating in the physical world, for example those which are using signals from cameras and other sensors as input. This paper shows that even in such physical world scenarios, machine learning systems are vulnerable to adversarial examples. We demonstrate this by feeding adversarial images obtained from a cell-phone camera to an ImageNet Inception classifier and measuring the classification accuracy of the system. We find that a large fraction of adversarial examples are classified incorrectly even when perceived through the camera.},
  file = {/Users/ryedida/Zotero/storage/JRK2CK3N/Kurakin, Goodfellow, Bengio - 2019 - Adversarial examples in the physical world(2).pdf}
}

% Kuzborskij & Szepesvári (2020), arXiv preprint 1909.01931.
@online{kuzborskijEfronSteinPACBayesianInequalities2020,
  author = {Kuzborskij, Ilja and Szepesvári, Csaba},
  title = {Efron-{{Stein PAC-Bayesian Inequalities}}},
  date = {2020-02-03},
  eprinttype = {arxiv},
  eprint = {1909.01931},
  eprintclass = {cs, stat},
  pubstate = {preprint},
  url = {http://arxiv.org/abs/1909.01931},
  urldate = {2024-01-16},
  langid = {english},
  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract = {We prove semi-empirical concentration inequalities for random variables which are given as possibly nonlinear functions of independent random variables. These inequalities describe concentration of random variable in terms of the data/distribution-dependent Efron-Stein (ES) estimate of its variance and they do not require any additional assumptions on the moments. In particular, this allows us to state semiempirical Bernstein type inequalities for general functions of unbounded random variables, which gives user-friendly concentration bounds for cases where related methods (e.g. bounded differences) might be more challenging to apply. We extend these results to Efron-Stein PAC-Bayesian inequalities which hold for arbitrary probability kernels that define a random, data-dependent choice of the function of interest. Finally, we demonstrate a number of applications, including PAC-Bayesian generalization bounds for unbounded loss functions, empirical Bernstein type generalization bounds, new truncation-free bounds for off-policy evaluation with Weighted Importance Sampling (WIS), and off-policy PAC-Bayesian learning with WIS.},
  file = {/Users/ryedida/Zotero/storage/C2C7WCDX/Kuzborskij and Szepesvári - 2020 - Efron-Stein PAC-Bayesian Inequalities.pdf}
}

% Lai & Robbins (1985), Advances in Applied Mathematics 6(1).
% The paper has exactly two authors, so there is no "and others" (which would render as "et al.").
@article{lai1985asymptotically,
  title = {Asymptotically Efficient Adaptive Allocation Rules},
  author = {Lai, Tze Leung and Robbins, Herbert},
  date = {1985},
  journaltitle = {Advances in Applied Mathematics},
  volume = {6},
  number = {1},
  pages = {4--22}
}

% AAAI 2017 conference paper (arXiv:1610.09064), hence @inproceedings with the venue in `booktitle`.
% The exported `issue` value was a stray citation fragment, not an issue designation, and is omitted.
@inproceedings{Lakkaraju2017,
  title = {Identifying Unknown Unknowns in the Open World: {{Representations}} and Policies for Guided Exploration},
  author = {Lakkaraju, Himabindu and Kamar, Ece and Caruana, Rich and Horvitz, Eric},
  date = {2017},
  booktitle = {31st AAAI Conference on Artificial Intelligence, AAAI 2017},
  eprint = {1610.09064},
  eprinttype = {arxiv},
  pages = {2124--2132},
  abstract = {Predictive models deployed in the real world may assign incorrect labels to instances with high confidence. Such errors or unknown unknowns are rooted in model incompleteness, and typically arise because of the mismatch between training data and the cases encountered at test time. As the models are blind to such errors, input from an oracle is needed to identify these failures. In this paper, we formulate and address the problem of informed discovery of unknown unknowns of any given predictive model where unknown unknowns occur due to systematic biases in the training data. We propose a model-agnostic methodology which uses feedback from an oracle to both identify unknown unknowns and to intelligently guide the discovery. We employ a two-phase approach which first organizes the data into multiple partitions based on the feature similarity of instances and the confidence scores assigned by the predictive model, and then utilizes an explore-exploit strategy for discovering unknown unknowns across these partitions. We demonstrate the efficacy of our framework by varying the underlying causes of unknown unknowns across various applications. To the best of our knowledge, this paper presents the first algorithmic approach to the problem of discovering unknown unknowns of predictive models.},
  file = {/Users/ryedida/Zotero/storage/3VTLHVIB/Lakkaraju et al. - 2017 - Identifying unknown unknowns in the open world Representations and policies for guided exploration(2).pdf}
}

% Last, Friedman & Kandel (2003), ACM SIGKDD conference paper, pp. 388--396.
@inproceedings{last2003data,
  author = {Last, Mark and Friedman, Menahem and Kandel, Abraham},
  title = {The Data Mining Approach to Automated Software Testing},
  booktitle = {Proceedings of the Ninth {{ACM SIGKDD}} International Conference on {{Knowledge}} Discovery and Data Mining},
  date = {2003},
  publisher = {{ACM}},
  pages = {388--396}
}

% Lavazza, Morasca & Rotoloni (2023), EASE '23 conference paper on AUC as a performance metric.
@inproceedings{lavazzaReliabilityAreaROC2023,
  author = {Lavazza, Luigi and Morasca, Sandro and Rotoloni, Gabriele},
  title = {On the {{Reliability}} of the {{Area Under}} the {{ROC Curve}} in {{Empirical Software Engineering}}},
  booktitle = {Proceedings of the 27th {{International Conference}} on {{Evaluation}} and {{Assessment}} in {{Software Engineering}}},
  series = {{{EASE}} '23},
  date = {2023-06-14},
  publisher = {{Association for Computing Machinery}},
  location = {{New York, NY, USA}},
  pages = {93--100},
  isbn = {9798400700446},
  doi = {10.1145/3593434.3593456},
  url = {https://dl.acm.org/doi/10.1145/3593434.3593456},
  urldate = {2023-10-05},
  keywords = {accuracy,Binary classifiers,Matthews Correlation Coefficient.,Pearson ϕ,performance metrics,predictors},
  abstract = {Binary classifiers are commonly used in software engineering research to estimate several software qualities, e.g., defectiveness or vulnerability. Thus, it is important to adequately evaluate how well binary classifiers perform, before they are used in practice. The Area Under the Curve (AUC) of Receiver Operating Characteristic curves has often been used to this end. However, AUC has been the target of some criticisms, so it is necessary to evaluate under what conditions and to what extent AUC can be a reliable performance metric. We analyze AUC in relation to ϕ (also known as Matthews Correlation Coefficient), often considered a more reliable performance metric, by building the lines in the ROC space with constant value of ϕ, for several values of ϕ, and computing the corresponding values of AUC. By their very definitions, AUC and ϕ depend on the prevalence ρ of a dataset, which is the proportion of its positive instances (e.g., the defective software modules). Hence, so does the relationship between AUC and ϕ. It turns out that AUC and ϕ are very well correlated, and therefore provide concordant indications, for balanced datasets (those with ρ ≃ 0.5). Instead, AUC tends to become quite large, and hence provide over-optimistic indications, for very imbalanced datasets (those with ρ ≃ 0 or ρ ≃ 1). We use examples from the software engineering literature to illustrate the analytical relationship linking AUC, ϕ, and ρ. We show that, for some values of ρ, the evaluation of performance based exclusively on AUC can be deceiving. In conclusion, this paper provides some guidelines for an informed usage and interpretation of AUC.},
  file = {/Users/ryedida/Zotero/storage/6NAQD7YV/Lavazza et al_2023_On the Reliability of the Area Under the ROC Curve in Empirical Software.pdf}
}

% Lazaridou, Pham & Baroni (2016), arXiv:1603.02618.
% The exported `doi` and `issn` did not identify this paper (they appear to belong to an
% unrelated biomedical journal) and would have produced a wrong link, so they are omitted.
% NOTE(review): the ISBN is kept as exported; confirm which proceedings volume it refers to.
@unpublished{Lazaridou2016,
  title = {The Red One!: {{On}} Learning to Refer to Things Based on Their Discriminative Properties},
  author = {Lazaridou, Angeliki and Pham, Nghia The and Baroni, Marco},
  date = {2016},
  eprint = {1603.02618},
  eprinttype = {arxiv},
  abstract = {As a first step towards agents learning to communicate about their visual environment, we propose a system that, given visual representations of a referent (cat) and a context (sofa), identifies their discriminative attributes, i.e., properties that distinguish them (has\_tail). Moreover, despite the lack of direct supervision at the attribute level, the model learns to assign plausible attributes to objects (sofa-has\_cushion). Finally, we present a preliminary experiment confirming the referential success of the predicted discriminative attributes.},
  isbn = {9781510827592}
}

% LeCun, Bengio & Hinton (2015), "Deep learning", Nature 521(7553), pp. 436--444.
% One record for the paper: the former duplicate key `lecun2015deep` is kept as a Biber citation-key
% alias through `ids`, so \cite{lecun2015deep} and \cite{lecunDeepLearning2015} both resolve here.
% The issue number is stored only in `number`; `issue` is reserved for non-numeric designations.
@article{lecunDeepLearning2015,
  ids = {lecun2015deep},
  title = {Deep Learning},
  author = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
  date = {2015-05},
  journaltitle = {Nature},
  volume = {521},
  number = {7553},
  pages = {436--444},
  publisher = {{Nature Publishing Group}},
  issn = {1476-4687},
  doi = {10.1038/nature14539},
  url = {https://www.nature.com/articles/nature14539},
  urldate = {2023-11-18},
  abstract = {Deep learning allows computational models that are composed of multiple processing layers to learn representations of data with multiple levels of abstraction. These methods have dramatically improved the state-of-the-art in speech recognition, visual object recognition, object detection and many other domains such as drug discovery and genomics. Deep learning discovers intricate structure in large data sets by using the backpropagation algorithm to indicate how a machine should change its internal parameters that are used to compute the representation in each layer from the representation in the previous layer. Deep convolutional nets have brought about breakthroughs in processing images, video, speech and audio, whereas recurrent nets have shone light on sequential data such as text and speech.},
  langid = {english},
  keywords = {Computer science,Mathematics and computing},
  file = {/Users/ryedida/Zotero/storage/XHX6UDLZ/LeCun et al_2015_Deep learning.pdf}
}

% CVPR 2016 conference paper (arXiv:1604.05495), hence @inproceedings with the venue in `booktitle`.
% The exported `volume` was a truncated date string rather than a volume number and is omitted.
@inproceedings{Lee2016,
  title = {Deep Saliency with Encoded Low Level Distance Map and High Level Features},
  author = {Lee, Gayoung and Tai, Yu Wing and Kim, Junmo},
  date = {2016},
  booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition},
  eprint = {1604.05495},
  eprinttype = {arxiv},
  pages = {660--668},
  issn = {10636919},
  doi = {10.1109/CVPR.2016.78},
  abstract = {Recent advances in saliency detection have utilized deep learning to obtain high level features to detect salient regions in a scene. These advances have demonstrated superior results over previous works that utilize hand-crafted low level features for saliency detection. In this paper, we demonstrate that hand-crafted features can provide complementary information to enhance performance of saliency detection that utilizes only high level features. Our method utilizes both high level and low level features for saliency detection under a unified deep learning framework. The high level features are extracted using the VGG-net, and the low level features are compared with other parts of an image to form a low level distance map. The low level distance map is then encoded using a convolutional neural network(CNN) with multiple 1X1 convolutional and ReLU layers. We concatenate the encoded low level distance map and the high level features, and connect them to a fully connected neural network classifier to evaluate the saliency of a query region. Our experiments show that our method can further improve the performance of state-of-the-art deep learning-based saliency detection methods.},
  isbn = {9781467388504},
  file = {/Users/ryedida/Zotero/storage/7GXF2JEW/Lee, Tai, Kim - 2016 - Deep saliency with encoded low level distance map and high level features(2).pdf}
}

% AAAI 2018 conference paper, hence @inproceedings with the venue in `booktitle`.
% "van der" is lowercase so it is parsed as the name prefix (von part) of the surname "Schaar".
% Line-break hyphenation left over from PDF extraction has been removed from the abstract.
@inproceedings{Lee2018,
  title = {{{DeepHit}}: {{A Deep Learning Approach}} to {{Survival Analysis}} with {{Competing Risks}}},
  author = {Lee, Changhee and Zame, William R and Yoon, Jinsung and van der Schaar, Mihaela},
  date = {2018},
  booktitle = {Thirty-Second AAAI Conference on Artificial Intelligence},
  url = {http://medianetlab.ee.ucla.edu/papers/AAAI_2018_DeepHit},
  abstract = {Survival analysis (time-to-event analysis) is widely used in economics and finance, engineering, medicine and many other areas. A fundamental problem is to understand the relationship between the covariates and the (distribution of) survival times (times-to-event). Much of the previous work has approached the problem by viewing the survival time as the first hitting time of a stochastic process, assuming a specific form for the underlying stochastic process, using available data to learn the relationship between the covariates and the parameters of the model, and then deducing the relationship between covariates and the distribution of first hitting times (the risk). However, previous models rely on strong parametric assumptions that are often violated. This paper proposes a very different approach to survival analysis, DeepHit, that uses a deep neural network to learn the distribution of survival times directly. DeepHit makes no assumptions about the underlying stochastic process and allows for the possibility that the relationship between covariates and risk(s) changes over time. Most importantly, DeepHit smoothly handles competing risks; i.e. settings in which there is more than one possible event of interest. Comparisons with previous models on the basis of real and synthetic datasets demonstrate that DeepHit achieves large and statistically significant performance improvements over previous state-of-the-art methods.},
  file = {/Users/ryedida/Zotero/storage/XQHNAAI4/Lee et al. - 2018 - DeepHit A Deep Learning Approach to Survival Analysis with Competing Risks(2).pdf}
}

% Lee et al. (2020), Bioinformatics 36(4): the BioBERT biomedical language model.
@article{lee2020biobert,
  author = {Lee, Jinhyuk and Yoon, Wonjin and Kim, Sungdong and Kim, Donghyeon and Kim, Sunkyu and So, Chan Ho and Kang, Jaewoo},
  title = {{{BioBERT}}: A Pre-Trained Biomedical Language Representation Model for Biomedical Text Mining},
  journaltitle = {Bioinformatics},
  date = {2020},
  volume = {36},
  number = {4},
  publisher = {{Oxford University Press}},
  pages = {1234--1240}
}

% Lee et al. (2020), IEEE Access 8, pp. 10503--10515.
% `journaltitle` holds the journal's name only, without the catalogue subtitle.
@article{lee2020continual,
  title = {Continual Prediction of Bug-Fix Time Using Deep Learning-Based Activity Stream Embedding},
  author = {Lee, Youngseok and Lee, Suin and Lee, Chan-Gun and Yeom, Ikjun and Woo, Honguk},
  date = {2020},
  journaltitle = {{{IEEE}} Access},
  shortjournal = {IEEE Access},
  volume = {8},
  pages = {10503--10515},
  publisher = {{IEEE}}
}

% NeurIPS 2020 proceedings paper (Advances in Neural Information Processing Systems 33).
% Typed @inproceedings so the venue, volume and pages are printed; @misc ignores those fields.
% Curran Associates is the publisher, so it goes in `publisher` rather than `organization`.
@inproceedings{leeLipschitzCertifiableTrainingTight2020,
  title = {Lipschitz-{{Certifiable Training}} with a {{Tight Outer Bound}}},
  author = {Lee, Sungyoon and Lee, Jaewook and Park, Saerom},
  date = {2020},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  volume = {33},
  pages = {16891--16902},
  publisher = {{Curran Associates, Inc.}},
  keywords = {NEURIPS2020\_c46482dd}
}

% Lee, Lee & Kang, arXiv:1904.08082 (v4); typed @online as an arXiv preprint, because the record
% has none of the institution/type data a @report needs.
% `date` is the year implied by the arXiv identifier (1904 = April 2019).
% NOTE(review): `url` points at the SAGPool code repository, not the paper; kept as exported.
@online{leeSelfAttentionGraphPooling,
  title = {Self-{{Attention Graph Pooling}}},
  author = {Lee, Junhyun and Lee, Inyeop and Kang, Jaewoo},
  date = {2019},
  eprint = {1904.08082v4},
  eprinttype = {arxiv},
  url = {https://github.com/inyeoplee77/SAGPool},
  urldate = {2021-03-12},
  abstract = {Advanced methods of applying deep learning to structured data such as graphs have been proposed in recent years. In particular, studies have focused on generalizing convolutional neural networks to graph data, which includes redefining the convolution and the downsampling (pooling) operations for graphs. The method of generalizing the convolution operation to graphs has been proven to improve performance and is widely used. However, the method of applying down-sampling to graphs is still difficult to perform and has room for improvement. In this paper, we propose a graph pooling method based on self-attention. Self-attention using graph convolution allows our pooling method to consider both node features and graph topology. To ensure a fair comparison, the same training procedures and model architectures were used for the existing pooling methods and our method. The experimental results demonstrate that our method achieves superior graph classification performance on the benchmark datasets using a reasonable number of parameters.}
}

% Lee, Ajanthan & Torr (2019), arXiv:1810.02340: SNIP, pruning a network once at initialization.
@unpublished{leeSNIPSingleshotNetwork2019,
  author = {Lee, Namhoon and Ajanthan, Thalaiyasingam and Torr, Philip H. S.},
  title = {{{SNIP}}: {{Single-shot Network Pruning}} Based on {{Connection Sensitivity}}},
  shorttitle = {{{SNIP}}},
  date = {2019-02-23},
  eprinttype = {arxiv},
  eprint = {1810.02340},
  eprintclass = {cs},
  url = {http://arxiv.org/abs/1810.02340},
  urldate = {2021-04-15},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
  annotation = {209 citations (Semantic Scholar/arXiv) [2021-04-15]},
  abstract = {Pruning large neural networks while maintaining their performance is often desirable due to the reduced space and time complexity. In existing methods, pruning is done within an iterative optimization procedure with either heuristically designed pruning schedules or additional hyperparameters, undermining their utility. In this work, we present a new approach that prunes a given network once at initialization prior to training. To achieve this, we introduce a saliency criterion based on connection sensitivity that identifies structurally important connections in the network for the given task. This eliminates the need for both pretraining and the complex pruning schedule while making it robust to architecture variations. After pruning, the sparse network is trained in the standard way. Our method obtains extremely sparse networks with virtually the same accuracy as the reference network on the MNIST, CIFAR-10, and Tiny-ImageNet classification tasks and is broadly applicable to various architectures including convolutional, residual and recurrent networks. Unlike existing methods, our approach enables us to demonstrate that the retained connections are indeed relevant to the given task.},
  file = {/Users/ryedida/Zotero/storage/MZ7XPPJK/Lee et al. - 2019 - SNIP Single-shot Network Pruning based on Connect.pdf}
}

% Lehtinen et al. (2018), arXiv:1803.04189 (v2); typed @online as an arXiv preprint, because the
% record has none of the institution/type data a @report needs.
% `url` now points at the arXiv abstract page; the exported URL was an image-dataset page
% (".../graphics/kodak/"), not the paper.
% Line-break hyphenation and collapsed dashes from PDF extraction are repaired in the abstract.
@online{Lehtinen2018,
  title = {{{Noise2Noise}}: {{Learning Image Restoration}} without {{Clean Data}}},
  author = {Lehtinen, Jaakko and Munkberg, Jacob and Hasselgren, Jon and Laine, Samuli and Karras, Tero and Aittala, Miika and Aila, Timo},
  date = {2018},
  eprint = {1803.04189v2},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1803.04189},
  abstract = {We apply basic statistical reasoning to signal reconstruction by machine learning -- learning to map corrupted observations to clean signals -- with a simple and powerful conclusion: it is possible to learn to restore images by only looking at corrupted examples, at performance at and sometimes exceeding training using clean data, without explicit image priors or likelihood models of the corruption. In practice, we show that a single model learns photographic noise removal, denoising synthetic Monte Carlo images, and reconstruction of undersampled MRI scans -- all corrupted by different processes -- based on noisy data only.},
  file = {/Users/ryedida/Zotero/storage/NQRWGBWM/Lehtinen et al. - 2018 - Noise2Noise Learning Image Restoration without Clean Data(2).pdf}
}

% Leino, Wang & Fredrikson (2021), arXiv:2102.08452: certifiably robust training via global Lipschitz bounds.
@unpublished{leinoGloballyRobustNeuralNetworks2021,
  author = {Leino, Klas and Wang, Zifan and Fredrikson, Matt},
  title = {Globally-{{Robust Neural Networks}}},
  date = {2021-02-16},
  eprinttype = {arxiv},
  eprint = {2102.08452},
  url = {http://arxiv.org/abs/2102.08452},
  urldate = {2021-03-03},
  abstract = {The threat of adversarial examples has motivated work on training certifiably robust neural networks, to facilitate efficient verification of local robustness at inference time. We formalize a notion of global robustness, which captures the operational properties of on-line local robustness certification while yielding a natural learning objective for robust training. We show that widely-used architectures can be easily adapted to this objective by incorporating efficient global Lipschitz bounds into the network, yielding certifiably-robust models by construction that achieve state-of-the-art verifiable and clean accuracy. Notably, this approach requires significantly less time and memory than recent certifiable training methods, and leads to negligible costs when certifying points on-line; for example, our evaluation shows that it is possible to train a large tiny-imagenet model in a matter of hours. We posit that this is possible using inexpensive global bounds -- despite prior suggestions that tighter local bounds are needed for good performance -- because these models are trained to achieve tighter global bounds. Namely, we prove that the maximum achievable verifiable accuracy for a given dataset is not improved by using a local bound.}
}

@article{leshno1993multilayer,
  title = {Multilayer Feedforward Networks with a Nonpolynomial Activation Function Can Approximate Any Function},
  author = {Leshno, Moshe and Lin, Vladimir Ya and Pinkus, Allan and Schocken, Shimon},
  date = {1993},
  journaltitle = {Neural Networks},
  volume = {6},
  number = {6},
  pages = {861--867},
  publisher = {{Elsevier}}
}

@article{Lessmann08,
  title = {Benchmarking {{Classification Models}} for {{Software Defect Prediction}}: {{A Proposed Framework}} and {{Novel Findings}}},
  author = {Lessmann, Stefan and Baesens, Bart and Mues, Christophe and Pietsch, Swantje},
  date = {2008},
  journaltitle = {IEEE Transactions on Software Engineering},
  volume = {34},
  number = {4},
  pages = {485--496}
}

@inproceedings{Li2016,
  title = {A {{Multi-Task Learning Formulation}} for {{Survival Analysis}}},
  booktitle = {Proceedings of the 22nd {{ACM SIGKDD International Conference}} on {{Knowledge Discovery}} and {{Data Mining}} - {{KDD}} '16},
  author = {Li, Yan and Wang, Jie and Ye, Jieping and Reddy, Chandan K.},
  date = {2016},
  pages = {1715--1724},
  doi = {10.1145/2939672.2939857},
  url = {http://dl.acm.org/citation.cfm?doid=2939672.2939857},
  abstract = {Predicting the occurrence of a particular event of interest at future time points is the primary goal of survival analysis. The presence of incomplete observations due to time limitations or loss of data traces is known as censoring which brings unique challenges in this domain and differentiates survival analysis from other standard regression methods. The popularly used survival analysis methods such as Cox proportional hazard model and parametric survival regression suffer from some strict assumptions and hypotheses that are not realistic in most of the real-world applications. To overcome the weaknesses of these two types of methods, we reformulate the survival analysis problem as a multi-task learning problem and propose a new multi-task learning based formulation to predict the survival time by estimating the survival status at each time interval during the study duration. We propose an indicator matrix to enable the multi-task learning algorithm to handle censored instances and incorporate some of the important characteristics of survival problems such as non-negative non-increasing list structure into our model through max-heap projection. We employ the l2,1-norm penalty to learn a shared representation across related tasks and hence select important features and alleviate over-fitting in high-dimensional feature spaces; thus, reducing the prediction error of each task. To efficiently handle the two non-smooth constraints, in this paper, we propose an optimization method which employs Alternating Direction Method of Multipliers (ADMM) algorithm to solve the proposed multi-task learning problem. We demonstrate the performance of the proposed method using real-world microarray gene expression datasets and show that our methods outperform state-of-the-art methods.},
  isbn = {9781450342322},
  keywords = {high-dimensional data,multi-task learning,regularization,survival analysis},
  file = {/Users/ryedida/Zotero/storage/7XMV6ZMR/Li et al. - 2016 - A Multi-Task Learning Formulation for Survival Analysis(2).pdf}
}

@article{li2017hyperband,
  title = {Hyperband: {{A}} Novel Bandit-Based Approach to Hyperparameter Optimization},
  author = {Li, Lisha and Jamieson, Kevin and DeSalvo, Giulia and Rostamizadeh, Afshin and Talwalkar, Ameet},
  date = {2017},
  journaltitle = {The Journal of Machine Learning Research},
  volume = {18},
  number = {1},
  pages = {6765--6816},
  publisher = {{JMLR.org}}
}

@unpublished{Li2018,
  author     = {Li, Xiang and Chen, Shuo and Hu, Xiaolin and Yang, Jian},
  title      = {Understanding the {{Disharmony}} between {{Dropout}} and {{Batch Normalization}} by {{Variance Shift}}},
  date       = {2018},
  eprint     = {1801.05134},
  eprinttype = {arxiv},
  url        = {http://arxiv.org/abs/1801.05134},
  abstract   = {This paper first answers the question "why do the two most powerful techniques Dropout and Batch Normalization (BN) often lead to a worse performance when they are combined together?" in both theoretical and statistical aspects. Theoretically, we find that Dropout would shift the variance of a specific neural unit when we transfer the state of that network from train to test. However, BN would maintain its statistical variance, which is accumulated from the entire learning procedure, in the test phase. The inconsistency of that variance (we name this scheme as "variance shift") causes the unstable numerical behavior in inference that leads to more erroneous predictions finally, when applying Dropout before BN. Thorough experiments on DenseNet, ResNet, ResNeXt and Wide ResNet confirm our findings. According to the uncovered mechanism, we next explore several strategies that modifies Dropout and try to overcome the limitations of their combination by avoiding the variance shift risks.},
  file       = {/Users/ryedida/Zotero/storage/7JE4UTXF/Li et al. - 2018 - Understanding the Disharmony between Dropout and Batch Normalization by Variance Shift(2).pdf}
}

@unpublished{li2018deep,
  author     = {Li, Xiaochen and Jiang, He and Ren, Zhilei and Li, Ge and Zhang, Jingxuan},
  title      = {Deep {{Learning}} in {{Software Engineering}}},
  date       = {2018},
  eprint     = {1805.04825},
  eprinttype = {arxiv}
}

@article{liAdversarialRobustnessAttention2021,
  author       = {Li, Zhuorong and Feng, Chao and Wu, Minghui and Yu, Hongchuan and Zheng, Jianwei and Zhu, Fanwei},
  title        = {Adversarial Robustness via Attention Transfer},
  date         = {2021-06-01},
  journaltitle = {Pattern Recognition Letters},
  shortjournal = {Pattern Recognition Letters},
  volume       = {146},
  pages        = {172--178},
  issn         = {0167-8655},
  doi          = {10.1016/j.patrec.2021.03.011},
  url          = {https://www.sciencedirect.com/science/article/pii/S0167865521000982},
  urldate      = {2021-04-17},
  langid       = {english},
  keywords     = {Adversarial defense,Representation learning,Robustness,Transfer learning,Visual attention},
  abstract     = {Deep neural networks are known to be vulnerable to adversarial attacks. The empirical analysis in our study suggests that attacks tend to induce diverse network architectures to shift the attention to irrelevant regions. Motivated by this observation, we propose a regularization technique which enforces the attentions to be well aligned via the knowledge transfer mechanism, thereby encouraging the robustness. Resultant model exhibits unprecedented robustness, securing 63.81\% adversarial accuracy where the prior art is 51.59\% on CIFAR-10 dataset under PGD attacks. In addition, we go beyond performance to analytically investigate the proposed method as an effective defense. Significantly flattened loss landscape can be observed, demonstrating the promise of the proposed method for improving robustness and thus the deployment in security-sensitive settings.},
  file         = {/Users/ryedida/Zotero/storage/JPJ5KMBT/Li et al. - 2021 - Adversarial robustness via attention transfer.pdf;/Users/ryedida/Zotero/storage/FLH7A9BK/S0167865521000982.html}
}

@unpublished{liangCanFruitFly2021,
  author      = {Liang, Yuchen and Ryali, Chaitanya K. and Hoover, Benjamin and Grinberg, Leopold and Navlakha, Saket and Zaki, Mohammed J. and Krotov, Dmitry},
  title       = {Can a {{Fruit Fly Learn Word Embeddings}}?},
  date        = {2021-03-14},
  eprint      = {2101.06887},
  eprinttype  = {arxiv},
  eprintclass = {cs, q-bio, stat},
  url         = {http://arxiv.org/abs/2101.06887},
  urldate     = {2022-01-01},
  langid      = {english},
  keywords    = {Computer Science - Computation and Language,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Quantitative Biology - Neurons and Cognition,Statistics - Machine Learning},
  annotation  = {3 citations (Semantic Scholar/arXiv) [2022-01-01]},
  abstract    = {The mushroom body of the fruit fly brain is one of the best studied systems in neuroscience. At its core it consists of a population of Kenyon cells, which receive inputs from multiple sensory modalities. These cells are inhibited by the anterior paired lateral neuron, thus creating a sparse high dimensional representation of the inputs. In this work we study a mathematical formalization of this network motif and apply it to learning the correlational structure between words and their context in a corpus of unstructured text, a common natural language processing (NLP) task. We show that this network can learn semantic representations of words and can generate both static and context-dependent word embeddings. Unlike conventional methods (e.g., BERT, GloVe) that use dense representations for word embedding, our algorithm encodes semantic meaning of words and their context in the form of sparse binary hash codes. The quality of the learned representations is evaluated on word similarity analysis, word-sense disambiguation, and document classification. It is shown that not only can the fruit fly network motif achieve performance comparable to existing methods in NLP, but, additionally, it uses only a fraction of the computational resources (shorter training time and smaller memory footprint).},
  file        = {/Users/ryedida/Zotero/storage/93FBUEQK/Liang et al. - 2021 - Can a Fruit Fly Learn Word Embeddings.pdf}
}

@unpublished{liAreGenerativeClassifiers2019,
  author      = {Li, Yingzhen and Bradshaw, John and Sharma, Yash},
  title       = {Are {{Generative Classifiers More Robust}} to {{Adversarial Attacks}}?},
  date        = {2019-05-27},
  eprint      = {1802.06552},
  eprinttype  = {arxiv},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1802.06552},
  urldate     = {2021-04-09},
  langid      = {english},
  keywords    = {Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation  = {35 citations (Semantic Scholar/arXiv) [2021-04-09]},
  abstract    = {There is a rising interest in studying the robustness of deep neural network classifiers against adversaries, with both advanced attack and defence techniques being actively developed. However, most recent work focuses on discriminative classifiers, which only model the conditional distribution of the labels given the inputs. In this paper, we propose and investigate the deep Bayes classifier, which improves classical naive Bayes with conditional deep generative models. We further develop detection methods for adversarial examples, which reject inputs with low likelihood under the generative model. Experimental results suggest that deep Bayes classifiers are more robust than deep discriminative classifiers, and that the proposed detection methods are effective against many recently proposed attacks.},
  file        = {/Users/ryedida/Zotero/storage/B3AU9GCF/Li et al. - 2019 - Are Generative Classifiers More Robust to Adversar.pdf}
}

@online{liAutomatingCodeReview2022,
  author      = {Li, Zhiyu and Lu, Shuai and Guo, Daya and Duan, Nan and Jannu, Shailesh and Jenks, Grant and Majumder, Deep and Green, Jared and Svyatkovskiy, Alexey and Fu, Shengyu and Sundaresan, Neel},
  title       = {Automating {{Code Review Activities}} by {{Large-Scale Pre-training}}},
  date        = {2022-10-11},
  eprint      = {2203.09095},
  eprinttype  = {arxiv},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2203.09095},
  urldate     = {2023-10-06},
  langid      = {english},
  pubstate    = {preprint},
  keywords    = {Computer Science - Artificial Intelligence,Computer Science - Software Engineering},
  abstract    = {Code review is an essential part to software development lifecycle since it aims at guaranteeing the quality of codes. Modern code review activities necessitate developers viewing, understanding and even running the programs to assess logic, functionality, latency, style and other factors. It turns out that developers have to spend far too much time reviewing the code of their peers. Accordingly, it is in significant demand to automate the code review process. In this research, we focus on utilizing pre-training techniques for the tasks in the code review scenario. We collect a large-scale dataset of real-world code changes and code reviews from open-source projects in nine of the most popular programming languages. To better understand code diffs and reviews, we propose CodeReviewer, a pre-trained model that utilizes four pre-training tasks tailored specifically for the code review scenario. To evaluate our model, we focus on three key tasks related to code review activities, including code change quality estimation, review comment generation and code refinement. Furthermore, we establish a high-quality benchmark dataset based on our collected data for these three tasks and conduct comprehensive experiments on it. The experimental results demonstrate that our model outperforms the previous state-of-the-art pre-training approaches in all tasks. Further analysis show that our proposed pre-training tasks and the multilingual pre-training dataset benefit the model on the understanding of code changes and reviews.},
  file        = {/Users/ryedida/Zotero/storage/A8CLBPWN/Li et al. - 2022 - Automating Code Review Activities by Large-Scale P.pdf}
}

@article{liClassSumDeepLearning2023,
  author       = {Li, Mingchen and Yu, Huiqun and Fan, Guisheng and Zhou, Ziyi and Huang, Jiawen},
  title        = {{{ClassSum}}: A Deep Learning Model for Class-Level Code Summarization},
  shorttitle   = {{{ClassSum}}},
  date         = {2023-02-01},
  journaltitle = {Neural Computing and Applications},
  shortjournal = {Neural Comput \& Applic},
  volume       = {35},
  number       = {4},
  pages        = {3373--3393},
  issn         = {1433-3058},
  doi          = {10.1007/s00521-022-07877-z},
  url          = {https://doi.org/10.1007/s00521-022-07877-z},
  urldate      = {2023-10-06},
  langid       = {english},
  keywords     = {Class documentation,Code summarization,Deep learning,Program comprehension},
  abstract     = {Code summaries are clear and concise natural language descriptions of program entities. Meaningful code summaries assist developers in better understanding. Code summarization refers to the task of generating a natural language summary from a code snippet. Most researches on code summarization focus on automatically generating summaries for methods or functions. However, in an object-oriented language such as Java, class is the basic programming unit rather than method. To fill this gap, in this paper, we investigate how to generate summaries for Java classes utilizing deep learning-based approaches. We propose a novel encoder–decoder model called ClassSum to generate functionality descriptions for Java classes and build a dataset containing 172,639 {$<$}class, summary{$>$} pairs from 3185 repositories hosted on Github. Since the code of class is much longer and more complicated, encoding a whole class via neural network is more challenging than encoding a method. On the other hand, the content within a class may be incomplete. To overcome this difficulty, we reduce the code of a class by only keeping its key elements, namely class signatures, method signatures and attribute names. To utilize both lexical and structural information of code, our model takes token sequence and abstract syntax tree of the reduced class content as inputs. ClassSum and five baselines (designed for method-level code summarization) are evaluated on our dataset. Experiment results show that summaries generated by ClassSum are more accurate and readable than those generated by baselines. Our dataset is available at https://github.com/classsum/ClassSum.},
  file         = {/Users/ryedida/Zotero/storage/ZCKXTYSF/Li et al_2023_ClassSum.pdf}
}

@report{Lin,
  author   = {Lin, Chen and Chi, Min},
  title    = {Comparisons of {{BKT}}, {{RNN}} and {{LSTM}} for {{Predicting Student Learning Gains}}},
  keywords = {BKT,Learning Gain Prediction,LSTM,RNN},
  abstract = {The objective of this study is to develop effective computational models that can predict student learning gains, preferably as early as possible. We compared a series of Bayesian Knowledge Tracing (BKT) models against vanilla RNNs and Long Short Term Memory (LSTM) based models. Our results showed that the LSTM-based model achieved the highest accuracy and the RNN based model have the highest F1-measure. Interestingly, we found that RNN can achieve a reasonably accurate prediction of student final learning gains using only the first 40\% of the entire training sequence; using the first 70\% of the sequence would produce a result comparable to using the entire sequence.},
  file     = {/Users/ryedida/Zotero/storage/H4WKDDE2/Lin, Chi - Unknown - Comparisons of BKT, RNN and LSTM for Predicting Student Learning Gains(2).pdf}
}

@inproceedings{Lin2016,
  title = {Intervention-{{BKT}}: {{Incorporating}} Instructional Interventions into {{Bayesian}} Knowledge Tracing},
  booktitle = {Lecture {{Notes}} in {{Computer Science}} (Including Subseries {{Lecture Notes}} in {{Artificial Intelligence}} and {{Lecture Notes}} in {{Bioinformatics}})},
  author = {Lin, Chen and Chi, Min},
  date = {2016},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-39583-8_20},
  abstract = {Bayesian Knowledge Tracing (BKT) is one of the most widely adopted student modeling methods in Intelligent Tutoring Systems (ITSs). Conventional BKT mainly leverages sequences of observations (e.g. correct, incorrect) from student-system interaction log files to infer student latent knowledge states (e.g. unlearned, learned). However, the model does not take into account the instructional interventions that generate those observations. On the other hand, we hypothesized that various types of instructional interventions can impact student's latent states differently. Therefore, we proposed a new student model called Intervention-Bayesian Knowledge Tracing (Intervention-BKT). Our results showed the new model outperforms conventional BKT and two factor analysis based alternatives: Additive Factor Model (AFM) and Instructional Factor Model (IFM); moreover, the learned parameters of Intervention-BKT can recommend adaptive pedagogical policies.},
  isbn = {978-3-319-39582-1},
  keywords = {Hidden Markov Model,Input Output Hidden Markov Model,Instructional intervention,Knowledge tracing,Student modeling},
  file = {/Users/ryedida/Zotero/storage/TFA2KERE/Lin, Chi - 2016 - Intervention-BKT Incorporating instructional interventions into Bayesian knowledge tracing(2).pdf}
}

@inproceedings{Lin2016a,
  title = {Incorporating {{Student Response Time}} and {{Tutor Instructional Interventions}} into {{Student Modeling}}},
  booktitle = {Proceedings of the 2016 {{Conference}} on {{User Modeling Adaptation}} and {{Personalization}} - {{UMAP}} '16},
  author = {Lin, Chen and Shen, Shitian and Chi, Min},
  date = {2016},
  doi = {10.1145/2930238.2930291},
  abstract = {Bayesian Knowledge Tracing (BKT) is one of the most widely adopted student-modeling methods. It uses performance (incorrect, correct) to infer student knowledge state (unlearned, learned). However, performance can be noisy and thus we explored another type of observations – student response time. Furthermore, we proposed Intervention Bayesian Knowledge Tracing (Intervention-BKT) which can incorporate multiple types of instructional interventions into the conventional BKT model. Our results show that for next-step performance predictions, Intervention-BKT is more effective than BKT; whereas to predict students' post-test scores, including student response time would yield better result than using performance alone.},
  isbn = {978-1-4503-4368-8},
  file = {/Users/ryedida/Zotero/storage/F3AH5KUY/Lin, Shen, Chi - 2016 - Incorporating Student Response Time and Tutor Instructional Interventions into Student Modeling(2).pdf}
}

@inproceedings{lin2017focal,
  author    = {Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Dollár, Piotr},
  title     = {Focal Loss for Dense Object Detection},
  booktitle = {Proceedings of the {{IEEE}} International Conference on Computer Vision},
  date      = {2017},
  pages     = {2980--2988}
}

@unpublished{Liu2016,
  author     = {Liu, Chenxi and Mao, Junhua and Sha, Fei and Yuille, Alan},
  title      = {Attention {{Correctness}} in {{Neural Image Captioning}}},
  date       = {2016},
  isbn       = {9781339831046},
  eprint     = {1605.09553},
  eprinttype = {arxiv},
  url        = {http://arxiv.org/abs/1605.09553},
  abstract   = {Attention mechanisms have recently been introduced in deep learning for various tasks in natural language processing and computer vision. But despite their popularity, the "correctness" of the implicitly-learned attention maps has only been assessed qualitatively by visualization of several examples. In this paper we focus on evaluating and improving the correctness of attention in neural image captioning models. Specifically, we propose a quantitative evaluation metric for the consistency between the generated attention maps and human annotations, using recently released datasets with alignment between regions in images and entities in captions. We then propose novel models with different levels of explicit supervision for learning attention maps during training. The supervision can be strong when alignment between regions and caption entities are available, or weak when only object segments and categories are provided. We show on the popular Flickr30k and COCO datasets that introducing supervision of attention maps during training solidly improves both attention correctness and caption quality, showing the promise of making machine perception more human-like.},
  file       = {/Users/ryedida/Zotero/storage/AY4XCX6W/Liu et al. - 2016 - Attention Correctness in Neural Image Captioning(2).pdf}
}

@unpublished{liu2017hierarchical,
  author     = {Liu, Hanxiao and Simonyan, Karen and Vinyals, Oriol and Fernando, Chrisantha and Kavukcuoglu, Koray},
  title      = {Hierarchical Representations for Efficient Architecture Search},
  date       = {2017},
  eprint     = {1711.00436},
  eprinttype = {arxiv}
}

@unpublished{liu2018darts,
  title = {{{DARTS}}: {{Differentiable}} Architecture Search},
  author = {Liu, Hanxiao and Simonyan, Karen and Yang, Yiming},
  date = {2018},
  eprint = {1806.09055},
  eprinttype = {arxiv}
}

@unpublished{Liu2019,
  author     = {Liu, Changliu and Arnon, Tomer and Lazarus, Christopher and Barrett, Clark and Kochenderfer, Mykel J.},
  title      = {Algorithms for {{Verifying Deep Neural Networks}}},
  date       = {2019},
  eprint     = {1903.06758},
  eprinttype = {arxiv},
  url        = {http://arxiv.org/abs/1903.06758},
  abstract   = {Deep neural networks are widely used for nonlinear function approximation with applications ranging from computer vision to control. Although these networks involve the composition of simple arithmetic operations, it can be very challenging to verify whether a particular network satisfies certain input-output properties. This article surveys methods that have emerged recently for soundly verifying such properties. These methods borrow insights from reachability analysis, optimization, and search. We discuss fundamental differences and connections between existing algorithms. In addition, we provide pedagogical implementations of existing methods and compare them on a set of benchmark problems.},
  file       = {/Users/ryedida/Zotero/storage/249J9Y5I/Liu et al. - 2019 - Algorithms for Verifying Deep Neural Networks(2).pdf}
}

@inproceedings{liu2019auto,
  title = {Auto-{{DeepLab}}: {{Hierarchical}} Neural Architecture Search for Semantic Image Segmentation},
  booktitle = {Proceedings of the {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
  author = {Liu, Chenxi and Chen, Liang-Chieh and Schroff, Florian and Adam, Hartwig and Hua, Wei and Yuille, Alan L and Fei-Fei, Li},
  date = {2019},
  pages = {82--92}
}

@inproceedings{liuImprovingInterpretabilityDeep2018,
  title = {Improving the {{Interpretability}} of {{Deep Neural Networks}} with {{Knowledge Distillation}}},
  booktitle = {2018 {{IEEE International Conference}} on {{Data Mining Workshops}} ({{ICDMW}})},
  author = {Liu, Xuan and Wang, Xiaoguang and Matwin, Stan},
  date = {2018-11},
  pages = {905--912},
  issn = {2375-9259},
  doi = {10.1109/ICDMW.2018.00132},
  url = {https://ieeexplore.ieee.org/abstract/document/8637552},
  urldate = {2023-10-30},
  abstract = {Deep Neural Networks have achieved huge success at a wide spectrum of applications from language modeling, computer vision to speech recognition. However, nowadays, good performance alone is not enough to satisfy the needs of practical deployment where interpretability is demanded for cases involving ethics and mission critical applications. The complex models of Deep Neural Networks make it hard to understand and reason the predictions, which hinders its further progress. To tackle this problem, we apply the Knowledge Distillation technique to distill Deep Neural Networks into decision trees in order to attain good performance and interpretability simultaneously. We formulate the problem at hand as a multi-output regression problem and the experiments demonstrate that the student model achieves significantly better accuracy performance (about 1\% to 5\%) than vanilla decision trees at the same level of tree depth. The experiments are implemented on the TensorFlow platform to make it scalable to big datasets. To the best of our knowledge, we are the first to distill Deep Neural Networks into vanilla decision trees on multi-class datasets.},
  eventtitle = {2018 {{IEEE International Conference}} on {{Data Mining Workshops}} ({{ICDMW}})},
  file = {/Users/ryedida/Zotero/storage/FZTRK2QU/Liu et al_2018_Improving the Interpretability of Deep Neural Networks with Knowledge.pdf;/Users/ryedida/Zotero/storage/HLQ794RB/8637552.html}
}

@online{liuRegularizingDeepNeural2023,
  author      = {Liu, Yucong and Yu, Shixing and Lin, Tong},
  title       = {Regularizing {{Deep Neural Networks}} with {{Stochastic Estimators}} of {{Hessian Trace}}},
  date        = {2023-02-21},
  eprint      = {2208.05924},
  eprinttype  = {arxiv},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2208.05924},
  urldate     = {2023-09-05},
  langid      = {english},
  pubstate    = {preprint},
  keywords    = {Computer Science - Machine Learning},
  abstract    = {In this paper, we develop a novel regularization method for deep neural networks by penalizing the trace of Hessian. This regularizer is motivated by a recent guarantee bound of the generalization error. We explain its benefits in finding flat minima and avoiding Lyapunov stability in dynamical systems. We adopt the Hutchinson method as a classical unbiased estimator for the trace of a matrix and further accelerate its calculation using a dropout scheme. Experiments demonstrate that our method outperforms existing regularizers and data augmentation methods, such as Jacobian, Confidence Penalty, Label Smoothing, Cutout, and Mixup.},
  file        = {/Users/ryedida/Zotero/storage/K6THDUCR/Liu et al. - 2023 - Regularizing Deep Neural Networks with Stochastic .pdf}
}

@unpublished{liVisualizingLossLandscape2017,
  title = {Visualizing the {{Loss Landscape}} of {{Neural Nets}}},
  author = {Li, Hao and Xu, Zheng and Taylor, Gavin and Studer, Christoph and Goldstein, Tom},
  date = {2017},
  eprint = {1712.09913},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1712.09913},
  abstract = {Neural network training relies on our ability to find "good" minimizers of highly non-convex loss functions. It is well-known that certain network architecture designs (e.g., skip connections) produce loss functions that train easier, and well-chosen training parameters (batch size, learning rate, optimizer) produce minimizers that generalize better. However, the reasons for these differences, and their effects on the underlying loss landscape, are not well understood. In this paper, we explore the structure of neural loss functions, and the effect of loss landscapes on generalization, using a range of visualization methods. First, we introduce a simple "filter normalization" method that helps us visualize loss function curvature and make meaningful side-by-side comparisons between loss functions. Then, using a variety of visualizations, we explore how network architecture affects the loss landscape, and how training parameters affect the shape of minimizers.},
  note = {NeurIPS 2018},
  file = {/Users/ryedida/Zotero/storage/XCDVXKJB/Li et al. - 2017 - Visualizing the Loss Landscape of Neural Nets(2).pdf}
}

@unpublished{Lomuscio2017,
  author     = {Lomuscio, Alessio and Maganti, Lalit},
  title      = {An Approach to Reachability Analysis for Feed-Forward {{ReLU}} Neural Networks},
  date       = {2017},
  eprint     = {1706.07351},
  eprinttype = {arxiv},
  url        = {http://arxiv.org/abs/1706.07351},
  abstract   = {We study the reachability problem for systems implemented as feed-forward neural networks whose activation function is implemented via ReLU functions. We draw a correspondence between establishing whether some arbitrary output can ever be outputed by a neural system and linear problems characterising a neural system of interest. We present a methodology to solve cases of practical interest by means of a state-of-the-art linear programs solver. We evaluate the technique presented by discussing the experimental results obtained by analysing reachability properties for a number of benchmarks in the literature.},
  file       = {/Users/ryedida/Zotero/storage/WR77FB2N/Lomuscio, Maganti - 2017 - An approach to reachability analysis for feed-forward ReLU neural networks(2).pdf}
}

@unpublished{loshchilovDecoupledWeightDecay2019,
  title = {Decoupled {{Weight Decay Regularization}}},
  author = {Loshchilov, Ilya and Hutter, Frank},
  date = {2019-01-04},
  eprint = {1711.05101},
  eprinttype = {arxiv},
  eprintclass = {cs, math},
  url = {http://arxiv.org/abs/1711.05101},
  urldate = {2021-04-15},
  abstract = {L$_2$ regularization and weight decay regularization are equivalent for standard stochastic gradient descent (when rescaled by the learning rate), but as we demonstrate this is \emph{not} the case for adaptive gradient algorithms, such as Adam. While common implementations of these algorithms employ L$_2$ regularization (often calling it "weight decay" in what may be misleading due to the inequivalence we expose), we propose a simple modification to recover the original formulation of weight decay regularization by \emph{decoupling} the weight decay from the optimization steps taken w.r.t. the loss function. We provide empirical evidence that our proposed modification (i) decouples the optimal choice of weight decay factor from the setting of the learning rate for both standard SGD and Adam and (ii) substantially improves Adam's generalization performance, allowing it to compete with SGD with momentum on image classification datasets (on which it was previously typically outperformed by the latter). Our proposed decoupled weight decay has already been adopted by many researchers, and the community has implemented it in TensorFlow and PyTorch; the complete source code for our experiments is available at https://github.com/loshchil/AdamW-and-SGDW},
  langid = {english},
  keywords = {Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Mathematics - Optimization and Control},
  annotation = {301 citations (Semantic Scholar/arXiv) [2021-04-15]},
  file = {/Users/ryedida/Zotero/storage/ADRMDYCA/Loshchilov and Hutter - 2019 - Decoupled Weight Decay Regularization.pdf}
}

@online{loshchilovSGDRStochasticGradient2017,
  author      = {Loshchilov, Ilya and Hutter, Frank},
  title       = {{{SGDR}}: {{Stochastic Gradient Descent}} with {{Warm Restarts}}},
  shorttitle  = {{{SGDR}}},
  date        = {2017-05-03},
  eprint      = {1608.03983},
  eprinttype  = {arxiv},
  eprintclass = {cs, math},
  url         = {http://arxiv.org/abs/1608.03983},
  urldate     = {2024-01-22},
  langid      = {english},
  pubstate    = {preprint},
  keywords    = {Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Mathematics - Optimization and Control},
  abstract    = {Restart techniques are common in gradient-free optimization to deal with multimodal functions. Partial warm restarts are also gaining popularity in gradient-based optimization to improve the rate of convergence in accelerated gradient schemes to deal with ill-conditioned functions. In this paper, we propose a simple warm restart technique for stochastic gradient descent to improve its anytime performance when training deep neural networks. We empirically study its performance on the CIFAR-10 and CIFAR-100 datasets, where we demonstrate new state-of-the-art results at 3.14\% and 16.21\%, respectively. We also demonstrate its advantages on a dataset of EEG recordings and on a downsampled version of the ImageNet dataset. Our source code is available at https://github.com/loshchil/SGDR},
  file        = {/Users/ryedida/Zotero/storage/AUEXMAQF/Loshchilov and Hutter - 2017 - SGDR Stochastic Gradient Descent with Warm Restar.pdf}
}

@article{luoNovelDatasetspecificFeature2020,
  author = {Luo, Yuxuan and Wang, Xizhao and Cao, Weipeng},
  title = {A Novel Dataset-Specific Feature Extractor for Zero-Shot Learning},
  journaltitle = {Neurocomputing},
  shortjournal = {Neurocomputing},
  volume = {391},
  pages = {74--82},
  date = {2020-05-28},
  issn = {0925-2312},
  doi = {10.1016/j.neucom.2020.01.069},
  url = {https://www.sciencedirect.com/science/article/pii/S0925231220301211},
  urldate = {2021-03-29},
  langid = {english},
  keywords = {Feature extractor,Label tree,Residual networks,Zero shot learning},
  annotation = {3 citations (Semantic Scholar/DOI) [2021-03-29]},
  abstract = {Most of the existing Zero-Shot Learning(ZSL) algorithms adopt pre-trained neural networks as their feature extractors. Since these pre-trained models are not specially designed for ZSL tasks, it is difficult to guarantee the stability and generalization ability of the ZSL algorithms due to the feature mismatch. To alleviate this problem, we propose a novel dataset-specific feature extractor for ZSL according to an attribute-based label tree. Specifically, an attribute-based label tree is firstly built via K-means clustering and then the information extracted from the label tree is used to fine-tune the parameters of the pre-trained models in order to make the extracted features more suitable for the current ZSL task. The experimental results on three typical ZSL datasets show that our approach can effectively improve the predictive accuracy of the existing ZSL algorithms and significantly accelerate their convergence rate. Additionally we explain the experimental phenomena from the perspective of feature visualization, which experimentally show that the features extracted by our method are much more separable than those of the original pre-trained models.},
  file = {/Users/ryedida/Zotero/storage/3HCTSXDL/Luo et al. - 2020 - A novel dataset-specific feature extractor for zer.pdf;/Users/ryedida/Zotero/storage/R65H5KIQ/S0925231220301211.html}
}

% Conference paper (ASE 2018); the proceedings title belongs in booktitle, not journaltitle.
@inproceedings{Ma2018,
  title = {{{DeepGauge}}: {{Multi-granularity}} Testing Criteria for Deep Learning Systems},
  booktitle = {ASE 2018 - Proceedings of the 33rd ACM/IEEE International Conference on Automated Software Engineering},
  author = {Ma, Lei and Juefei-Xu, Felix and Zhang, Fuyuan and Sun, Jiyuan and Xue, Minhui and Li, Bo and Chen, Chunyang and Su, Ting and Li, Li and Liu, Yang and Zhao, Jianjun and Wang, Yadong},
  date = {2018},
  eprint = {1803.07519},
  eprinttype = {arxiv},
  pages = {120--131},
  doi = {10.1145/3238147.3238202},
  abstract = {Deep learning (DL) defines a new data-driven programming paradigm that constructs the internal system logic of a crafted neuron network through a set of training data. We have seen wide adoption of DL in many safety-critical scenarios. However, a plethora of studies have shown that the state-of-the-art DL systems suffer from various vulnerabilities which can lead to severe consequences when applied to real-world applications. Currently, the testing adequacy of a DL system is usually measured by the accuracy of test data. Considering the limitation of accessible high quality test data, good accuracy performance on test data can hardly provide confidence to the testing adequacy and generality of DL systems. Unlike traditional software systems that have clear and controllable logic and functionality, the lack of interpretability in a DL system makes system analysis and defect detection difficult, which could potentially hinder its real-world deployment. In this paper, we propose DeepGauge, a set of multi-granularity testing criteria for DL systems, which aims at rendering a multi-faceted portrayal of the testbed. The in-depth evaluation of our proposed testing criteria is demonstrated on two well-known datasets, five DL systems, and with four state-of-the-art adversarial attack techniques against DL. The potential usefulness of DeepGauge sheds light on the construction of more generic and robust DL systems.},
  isbn = {9781450359375},
  keywords = {Deep learning,Deep neural networks,Software testing,Testing criteria},
  file = {/Users/ryedida/Zotero/storage/H4ELPUD4/Ma et al. - 2018 - DeepGauge Multi-granularity testing criteria for deep learning systems(2).pdf}
}

% Conference paper (ICLR 2018); the proceedings title belongs in booktitle, not journaltitle.
@inproceedings{Madry2018,
  title = {Towards Deep Learning Models Resistant to Adversarial Attacks},
  booktitle = {6th International Conference on Learning Representations, ICLR 2018 - Conference Track Proceedings},
  author = {Madry, Aleksander and Makelov, Aleksandar and Schmidt, Ludwig and Tsipras, Dimitris and Vladu, Adrian},
  date = {2018},
  eprint = {1706.06083},
  eprinttype = {arxiv},
  pages = {1--28},
  abstract = {Recent work has demonstrated that neural networks are vulnerable to adversarial examples, i.e., inputs that are almost indistinguishable from natural data and yet classified incorrectly by the network. To address this problem, we study the adversarial robustness of neural networks through the lens of robust optimization. This approach provides us with a broad and unifying view on much prior work on this topic. Its principled nature also enables us to identify methods for both training and attacking neural networks that are reliable and, in a certain sense, universal. In particular, they specify a concrete security guarantee that would protect against a well-defined class of adversaries. These methods let us train networks with significantly improved resistance to a wide range of adversarial attacks. They also suggest robustness against a first-order adversary as a natural security guarantee. We believe that robustness against such well-defined classes of adversaries is an important stepping stone towards fully resistant deep learning models.},
  file = {/Users/ryedida/Zotero/storage/SM7KJVKR/Madry et al. - 2018 - Towards deep learning models resistant to adversarial attacks(2).pdf}
}

% No pages field: the export only carried the IEEE early-access placeholder 1--1.
% NOTE(review): add volume/number/pages from the final journal issue once confirmed.
@article{maEasytoDeployAPIExtraction2019,
  title = {Easy-to-{{Deploy API Extraction}} by {{Multi-Level Feature Embedding}} and {{Transfer Learning}}},
  author = {Ma, Suyu and Xing, Zhenchang and Chen, Chunyang and Chen, Cheng and Qu, Lizhen and Li, Guoqiang},
  date = {2019},
  journaltitle = {IEEE Transactions on Software Engineering},
  issn = {1939-3520},
  doi = {10.1109/TSE.2019.2946830},
  abstract = {Application Programming Interfaces (APIs) have been widely discussed on social-technical platforms (e.g., Stack Overflow). Extracting API mentions from such informal software texts is the prerequisite for API-centric search and summarization of programming knowledge. Machine learning based API extraction has demonstrated superior performance than rule-based methods in informal software texts that lack consistent writing forms and annotations. However, machine learning based methods have a significant overhead in preparing training data and effective features. In this paper, we propose a multi-layer neural network based architecture for API extraction. Our architecture automatically learns character-, word- and sentence-level features from the input texts, thus removing the need for manual feature engineering and the dependence on advanced features (e.g., API gazetteer) beyond the input texts. We also propose to adopt transfer learning to adapt a source-library-trained model to a target-library, thus reducing the overhead of manual training-data labeling when the software text of multiple programming languages and libraries need to be processed. We conduct extensive experiments with six libraries of four programming languages which support diverse functionalities and have different API-naming and API-mention characteristics. Our experiments investigate the performance of our neural architecture for API extraction in informal software texts, the importance of different features, the effectiveness of transfer learning. Our results confirm not only the superior performance of our neural architecture than existing machine learning based methods for API extraction in informal software texts, but also the easy-to-deploy characteristic of our neural architecture.},
  keywords = {API extraction,CNN,Computer architecture,Feature extraction,Libraries,LSTM,Machine learning,Manuals,Software,Training data,Transfer learning,Word embedding},
  annotation = {10 citations (Semantic Scholar/DOI) [2021-06-12]},
  file = {/Users/ryedida/Zotero/storage/M7TIFDM8/Ma et al. - 2019 - Easy-to-Deploy API Extraction by Multi-Level Featu.pdf;/Users/ryedida/Zotero/storage/ENTALFVD/8865646.html}
}

% No doi/isbn fields: the PDF metadata carried the ACM sigconf template placeholders
% (10.1145/1122445.1122456 and 978-1-4503-9999-9), which do not identify this paper.
% The date follows the arXiv identifier (1904 = April 2019); url points at the authors' code archive.
@unpublished{maGraphConvolutionalNetworks2018,
  title = {Graph {{Convolutional Networks}} with {{EigenPooling}}},
  author = {Ma, Yao and Wang, Suhang and Aggarwal, Charu C. and Tang, Jiliang},
  date = {2019-04},
  eprint = {1904.13107v1},
  eprinttype = {arxiv},
  pagetotal = {10},
  publisher = {{ACM}},
  url = {http://cse.msu.edu/~mayao4/code/eigen_pooling.zip},
  urldate = {2021-03-05},
  abstract = {Graph neural networks, which generalize deep neural network models to graph structured data, have attracted increasing attention in recent years. They usually learn node representations by transforming, propagating and aggregating node features and have been proven to improve the performance of many graph related tasks such as node classification and link prediction. To apply graph neural networks for the graph classification task, approaches to generate the graph representation from node representations are demanded. A common way is to globally combine the node representations. However, rich structural information is overlooked. Thus a hierarchical pooling procedure is desired to preserve the graph structure during the graph representation learning. There are some recent works on hierarchically learning graph representation analogous to the pooling step in conventional convolutional neural (CNN) networks. However, the local structural information is still largely neglected during the pooling process. In this paper, we introduce a pooling operator EigenPooling based on graph Fourier transform, which can utilize the node features and local structures during the pooling process. We then design pooling layers based on the pooling operator, which are further combined with traditional GCN convolutional layers to form a graph neural network framework EigenGCN for graph classification. Theoretical analysis is provided to understand EigenPooling from both local and global perspectives. Experimental results of the graph classification task on 6 commonly used benchmarks demonstrate the effectiveness of the proposed framework. The code of this paper is available at}
}

@inproceedings{mahendranUnderstandingDeepImage2015,
  title = {Understanding Deep Image Representations by Inverting Them},
  booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  author = {Mahendran, Aravindh and Vedaldi, Andrea},
  date = {2015-06},
  pages = {5188--5196},
  publisher = {{IEEE}},
  location = {{Boston, MA, USA}},
  doi = {10.1109/CVPR.2015.7299155},
  url = {http://ieeexplore.ieee.org/document/7299155/},
  urldate = {2021-04-09},
  abstract = {Image representations, from SIFT and Bag of Visual Words to Convolutional Neural Networks (CNNs), are a crucial component of almost any image understanding system. Nevertheless, our understanding of them remains limited. In this paper we conduct a direct analysis of the visual information contained in representations by asking the following question: given an encoding of an image, to which extent is it possible to reconstruct the image itself? To answer this question we contribute a general framework to invert representations. We show that this method can invert representations such as HOG more accurately than recent alternatives while being applicable to CNNs too. We then use this technique to study the inverse of recent state-of-the-art CNN image representations for the first time. Among our findings, we show that several layers in CNNs retain photographically accurate information about the image, with different degrees of geometric and photometric invariance.},
  eventtitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  isbn = {978-1-4673-6964-0},
  langid = {english},
  annotation = {1197 citations (Semantic Scholar/DOI) [2021-04-09]},
  file = {/Users/ryedida/Zotero/storage/RJ53949B/Mahendran and Vedaldi - 2015 - Understanding deep image representations by invert.pdf}
}

@article{Mahloujifar2019,
  title = {The {{Curse}} of {{Concentration}} in {{Robust Learning}}: {{Evasion}} and {{Poisoning Attacks}} from {{Concentration}} of {{Measure}}},
  author = {Mahloujifar, Saeed and Diochnos, Dimitrios I. and Mahmoody, Mohammad},
  date = {2019},
  journaltitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume = {33},
  number = {01},
  eprint = {1809.03063},
  eprinttype = {arxiv},
  pages = {4536--4543},
  issn = {2159-5399},
  doi = {10.1609/aaai.v33i01.33014536},
  abstract = {Many modern machine learning classifiers are shown to be vulnerable to adversarial perturbations of the instances. Despite a massive amount of work focusing on making classifiers robust, the task seems quite challenging. In this work, through a theoretical study, we investigate the adversarial risk and robustness of classifiers and draw a connection to the well-known phenomenon of “concentration of measure” in metric measure spaces. We show that if the metric probability space of the test instance is concentrated, any classifier with some initial constant error is inherently vulnerable to adversarial perturbations. One class of concentrated metric probability spaces are the so-called Lévy families that include many natural distributions. In this special case, our attacks only need to perturb the test instance by at most O(√n) to make it misclassified, where n is the data dimension. Using our general result about Lévy instance spaces, we first recover as special case some of the previously proved results about the existence of adversarial examples. However, many more Lévy families are known (e.g., product distribution under the Hamming distance) for which we immediately obtain new attacks that find adversarial examples of distance O(√n). Finally, we show that concentration of measure for product spaces implies the existence of forms of “poisoning” attacks in which the adversary tampers with the training data with the goal of degrading the classifier. In particular, we show that for any learning algorithm that uses m training examples, there is an adversary who can increase the probability of any “bad property” (e.g., failing on a particular test instance) that initially happens with non-negligible probability to ≈ 1 by substituting only Õ(√m) of the examples with other (still correctly labeled) examples.}
}

% NOTE(review): the export had no venue or year. booktitle and date below assume the
% published ICML 2021 version of this paper -- confirm against the PDF before citing.
% pagetotal holds the page count (12); the in-proceedings page range is still missing.
@inproceedings{malkomesParetoEfficientFrontier,
  title = {Beyond the {{Pareto Efficient Frontier}}: {{Constraint Active Search}} for {{Multiobjective Experimental Design}}},
  booktitle = {Proceedings of the 38th {{International Conference}} on {{Machine Learning}}},
  author = {Malkomes, Gustavo and Cheng, Bolong and Lee, Eric Hans and McCourt, Michael},
  date = {2021},
  pagetotal = {12},
  abstract = {Many problems in engineering design and simulation require balancing competing objectives under the presence of uncertainty. Sample-efficient multiobjective optimization methods focus on the objective function values in metric space and ignore the sampling behavior of the design configurations in parameter space. Consequently, they may provide little actionable insight on how to choose designs in the presence of metric uncertainty or limited precision when implementing a chosen design. We propose a new formulation that accounts for the importance of the parameter space and is thus more suitable for multiobjective design problems; instead of searching for the Pareto-efficient frontier, we solicit the desired minimum performance thresholds on all objectives to define regions of satisfaction. We introduce an active search algorithm called Expected Coverage Improvement (ECI) to efficiently discover the region of satisfaction and simultaneously sample diverse acceptable configurations. We demonstrate our algorithm on several design and simulation domains: mechanical design, additive manufacturing, medical monitoring, and plasma physics.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/SU6SCTM5/Malkomes et al. - Beyond the Pareto Efficient Frontier Constraint A.pdf}
}

@online{mallikPriorBandPracticalHyperparameter2023,
  title = {{{PriorBand}}: {{Practical Hyperparameter Optimization}} in the {{Age}} of {{Deep Learning}}},
  shorttitle = {{{PriorBand}}},
  author = {Mallik, Neeratyoy and Bergman, Edward and Hvarfner, Carl and Stoll, Danny and Janowski, Maciej and Lindauer, Marius and Nardi, Luigi and Hutter, Frank},
  date = {2023-11-15},
  eprint = {2306.12370},
  eprinttype = {arxiv},
  eprintclass = {cs.LG},
  url = {http://arxiv.org/abs/2306.12370},
  urldate = {2023-12-09},
  abstract = {Hyperparameters of Deep Learning (DL) pipelines are crucial for their downstream performance. While a large number of methods for Hyperparameter Optimization (HPO) have been developed, their incurred costs are often untenable for modern DL. Consequently, manual experimentation is still the most prevalent approach to optimize hyperparameters, relying on the researcher’s intuition, domain knowledge, and cheap preliminary explorations. To resolve this misalignment between HPO algorithms and DL researchers, we propose PriorBand, an HPO algorithm tailored to DL, able to utilize both expert beliefs and cheap proxy tasks. Empirically, we demonstrate PriorBand’s efficiency across a range of DL benchmarks and show its gains under informative expert input and robustness against poor expert beliefs.},
  langid = {english},
  pubstate = {preprint},
  keywords = {Computer Science - Machine Learning},
  file = {/Users/ryedida/Zotero/storage/93VP3B8R/Mallik et al. - 2023 - PriorBand Practical Hyperparameter Optimization i.pdf}
}

@inproceedings{mani2019deeptriage,
  title = {{{DeepTriage}}: {{Exploring}} the Effectiveness of Deep Learning for Bug Triaging},
  booktitle = {Proceedings of the {{ACM India Joint International Conference}} on {{Data Science}} and {{Management}} of {{Data}}},
  author = {Mani, Senthil and Sankaran, Anush and Aralikatte, Rahul},
  date = {2019},
  pages = {171--179}
}

@article{mann1947test,
  title = {On a Test of Whether One of Two Random Variables Is Stochastically Larger than the Other},
  author = {Mann, Henry B. and Whitney, Donald R.},
  date = {1947},
  journaltitle = {The Annals of Mathematical Statistics},
  volume = {18},
  number = {1},
  pages = {50--60},
  publisher = {{JSTOR}},
  doi = {10.1214/aoms/1177730491}
}

@inproceedings{manziniBlackCriminalCaucasian2019,
  title = {Black Is to {{Criminal}} as {{Caucasian}} Is to {{Police}}: {{Detecting}} and {{Removing Multiclass Bias}} in {{Word Embeddings}}},
  shorttitle = {Black Is to {{Criminal}} as {{Caucasian}} Is to {{Police}}},
  booktitle = {Proceedings of the 2019 {{Conference}} of the {{North American Chapter}} of the {{Association}} for {{Computational Linguistics}}: {{Human Language Technologies}}, {{Volume}} 1 ({{Long}} and {{Short Papers}})},
  author = {Manzini, Thomas and Yao Chong, Lim and Black, Alan W and Tsvetkov, Yulia},
  date = {2019},
  pages = {615--621},
  publisher = {{Association for Computational Linguistics}},
  location = {{Minneapolis, Minnesota}},
  doi = {10.18653/v1/N19-1062},
  url = {http://aclweb.org/anthology/N19-1062},
  urldate = {2023-12-09},
  abstract = {Online texts—across genres, registers, domains, and styles—are riddled with human stereotypes, expressed in overt or subtle ways. Word embeddings, trained on these texts, perpetuate and amplify these stereotypes, and propagate biases to machine learning models that use word embeddings as features. In this work, we propose a method to debias word embeddings in multiclass settings such as race and religion, extending the work of (Bolukbasi et al., 2016) from the binary setting, such as binary gender. Next, we propose a novel methodology for the evaluation of multiclass debiasing. We demonstrate that our multiclass debiasing is robust and maintains the efficacy in standard NLP tasks.},
  eventtitle = {2019 {{Conference}} of the {{North American Chapter}} of the {{Association}} for {{Computational Linguistics}}: {{Human Language Technologies}}},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/EQR5WSE6/Manzini et al. - 2019 - Black is to Criminal as Caucasian is to Police De.pdf}
}

@unpublished{marcus2018deep,
  author = {Marcus, Gary},
  title = {Deep Learning: {{A}} Critical Appraisal},
  eprinttype = {arxiv},
  eprint = {1801.00631},
  date = {2018}
}

% One record for a work that was imported twice. The ids field is a Biber key alias, so
% citations of margeloiuGCondNetNovelMethoda resolve to this entry; both PDF attachments are kept.
% NOTE(review): journaltitle and date are still missing from the source metadata.
@article{margeloiuGCondNetNovelMethod,
  ids = {margeloiuGCondNetNovelMethoda},
  title = {{{GCondNet}}: {{A Novel Method}} for {{Improving Neural Networks}} on {{Small High-Dimensional Tabular Data}}},
  author = {Margeloiu, Andrei and Lio, Pietro and Simidjievski, Nikola and Jamnik, Mateja},
  abstract = {Neural network models often struggle with high-dimensional but small sample-size tabular datasets. One reason is that current weight initialisation methods assume independence between weights, which can be problematic when there are insufficient samples to estimate the model’s parameters accurately. In such small data scenarios, leveraging additional structures can improve the model’s performance and training stability. To address this, we propose GCondNet, a general approach to enhance neural networks by leveraging implicit structures present in tabular data. We create a graph between samples for each data dimension, and utilise Graph Neural Networks (GNNs) for extracting this implicit structure, and for conditioning the parameters of the first layer of an underlying predictor network. By creating many small graphs, GCondNet exploits the data’s high-dimensionality, and thus improves the performance of an underlying predictor network. We demonstrate the effectiveness of our method on 9 real-world datasets, where GCondNet outperforms 15 standard and state-of-the-art methods. The results show that GCondNet is a versatile framework for injecting graph-regularisation into various types of neural networks, including MLPs and tabular Transformers.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/2L7WFFG3/Margeloiu et al. - GCondNet A Novel Method for Improving Neural Netw.pdf;/Users/ryedida/Zotero/storage/XPDG9SFI/Margeloiu et al. - GCondNet A Novel Method for Improving Neural Netw.pdf}
}

@article{mastropaoloUsingTransferLearning2023,
  title = {Using {{Transfer Learning}} for {{Code-Related Tasks}}},
  author = {Mastropaolo, Antonio and Cooper, Nathan and Palacio, David Nader and Scalabrino, Simone and Poshyvanyk, Denys and Oliveto, Rocco and Bavota, Gabriele},
  date = {2023-04-01},
  journaltitle = {IEEE Transactions on Software Engineering},
  shortjournal = {IEEE Trans. Software Eng.},
  volume = {49},
  number = {4},
  pages = {1580--1598},
  issn = {0098-5589, 1939-3520, 2326-3881},
  doi = {10.1109/TSE.2022.3183297},
  url = {https://ieeexplore.ieee.org/document/9797060/},
  urldate = {2023-10-02},
  abstract = {Deep learning (DL) techniques have been used to support several code-related tasks such as code summarization and bug-fixing. In particular, pre-trained transformer models are on the rise, also thanks to the excellent results they achieved in Natural Language Processing (NLP) tasks. The basic idea behind these models is to first pre-train them on a generic dataset using a self-supervised task (e.g., filling masked words in sentences). Then, these models are fine-tuned to support specific tasks of interest (e.g., language translation). A single model can be fine-tuned to support multiple tasks, possibly exploiting the benefits of transfer learning. This means that knowledge acquired to solve a specific task (e.g., language translation) can be useful to boost performance on another task (e.g., sentiment classification). While the benefits of transfer learning have been widely studied in NLP, limited empirical evidence is available when it comes to code-related tasks. In this paper, we assess the performance of the Text-To-Text Transfer Transformer (T5) model in supporting four different code-related tasks: (i) automatic bug-fixing, (ii) injection of code mutants, (iii) generation of assert statements, and (iv) code summarization. We pay particular attention in studying the role played by pre-training and multi-task fine-tuning on the model’s performance. We show that (i) the T5 can achieve better performance as compared to state-of-the-art baselines; and (ii) while pre-training helps the model, not all tasks benefit from a multi-task fine-tuning.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/XBM7PSCD/Mastropaolo et al. - 2023 - Using Transfer Learning for Code-Related Tasks.pdf}
}

% The extracted author list ended in "Member, Senior", an IEEE membership grade rather than a
% person; "and others" stands in for the remaining authors.
% NOTE(review): journaltitle assumed from the volume/number/pages -- confirm against the PDF
% and fill in the complete author list.
@article{Masud2013,
  title = {Classification and {{Adaptive Novel Class Detection}} of {{Feature-Evolving Data Streams}}},
  author = {Masud, Mohammad M. and Chen, Qing and Khan, Latifur and others},
  date = {2013},
  journaltitle = {IEEE Transactions on Knowledge and Data Engineering},
  volume = {25},
  number = {7},
  pages = {1484--1497},
  file = {/Users/ryedida/Zotero/storage/2MG4HCJG/Masud et al. - 2013 - Classification and Adaptive Novel Class Detection of Feature-Evolving Data Streams(2).pdf}
}

@inproceedings{mcallesterPACBayesianModelAveraging1999,
  title = {{{PAC-Bayesian}} Model Averaging},
  booktitle = {Proceedings of the {{Twelfth Annual Conference}} on {{Computational Learning Theory}}},
  author = {McAllester, David A.},
  date = {1999-07-06},
  pages = {164--170},
  publisher = {{ACM}},
  location = {{Santa Cruz, California, USA}},
  doi = {10.1145/307400.307435},
  url = {https://dl.acm.org/doi/10.1145/307400.307435},
  urldate = {2024-01-27},
  abstract = {PAC-Bayesian learning methods combine the informative priors of Bayesian methods with distribution-free PAC guarantees. Building on earlier methods for PAC-Bayesian model selection, this paper presents a method for PAC-Bayesian model averaging. The method constructs an optimized weighted mixture of concepts analogous to a Bayesian posterior distribution. Although the main result is stated for bounded loss, a preliminary analysis for unbounded loss is also given.},
  eventtitle = {{{COLT99}}: {{The}} 12th {{Annual Conference}} on {{Computational Learning Theory}}},
  isbn = {978-1-58113-167-3},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/TY8FRMVH/McAllester - 1999 - PAC-Bayesian model averaging.pdf}
}

@article{mccabeComplexityMeasure1976,
  title = {A {{Complexity Measure}}},
  author = {McCabe, Thomas J.},
  date = {1976-12},
  journaltitle = {IEEE Transactions on Software Engineering},
  shortjournal = {IEEE Trans. Software Eng.},
  volume = {SE-2},
  number = {4},
  pages = {308--320},
  issn = {0098-5589},
  doi = {10.1109/TSE.1976.233837},
  url = {http://ieeexplore.ieee.org/document/1702388/},
  urldate = {2023-12-09},
  abstract = {This paper describes a graph-theoretic complexity measure and illustrates how it can be used to manage and control program complexity. The paper first explains how the graph-theory concepts apply and gives an intuitive explanation of the graph concepts in programming terms. The control graphs of several actual Fortran programs are then presented to illustrate the correlation between intuitive complexity and the graph-theoretic complexity. Several properties of the graph-theoretic complexity are then proved which show, for example, that complexity is independent of physical size (adding or subtracting functional statements leaves complexity unchanged) and complexity depends only on the decision structure of a program.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/SYSVTK8A/McCabe - 1976 - A Complexity Measure.pdf}
}

@unpublished{McInnes2018,
  author = {McInnes, Leland and Healy, John and Melville, James},
  title = {{{UMAP}}: {{Uniform Manifold Approximation}} and {{Projection}} for {{Dimension Reduction}}},
  eprinttype = {arxiv},
  eprint = {1802.03426},
  url = {http://arxiv.org/abs/1802.03426},
  date = {2018},
  file = {/Users/ryedida/Zotero/storage/ZQG7RY4V/McInnes, Healy, Melville - 2018 - UMAP Uniform Manifold Approximation and Projection for Dimension Reduction(2).pdf},
  abstract = {UMAP (Uniform Manifold Approximation and Projection) is a novel manifold learning technique for dimension reduction. UMAP is constructed from a theoretical framework based in Riemannian geometry and algebraic topology. The result is a practical scalable algorithm that applies to real world data. The UMAP algorithm is competitive with t-SNE for visualization quality, and arguably preserves more of the global structure with superior run time performance. Furthermore, UMAP has no computational restrictions on embedding dimension, making it viable as a general purpose dimension reduction technique for machine learning.}
}

% No doi/issn fields: the export stored the arXiv identifier in doi (it is already in eprint)
% and attached a journal ISSN to this preprint.
@unpublished{Mehta2018,
  title = {A High-Bias, Low-Variance Introduction to {{Machine Learning}} for Physicists},
  author = {Mehta, Pankaj and Bukov, Marin and Wang, Ching-Hao and Day, Alexandre G. R. and Richardson, Clint and Fisher, Charles K. and Schwab, David J.},
  date = {2018},
  eprint = {1803.08823},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1803.08823},
  abstract = {Machine Learning (ML) is one of the most exciting and dynamic areas of modern research and application. The purpose of this review is to provide an introduction to the core concepts and tools of machine learning in a manner easily understood and intuitive to physicists. The review begins by covering fundamental concepts in ML and modern statistics such as the bias-variance tradeoff, overfitting, regularization, and generalization before moving on to more advanced topics in both supervised and unsupervised learning. Topics covered in the review include ensemble models, deep learning and neural networks, clustering and data visualization, energy-based models (including MaxEnt models and Restricted Boltzmann Machines), and variational methods. Throughout, we emphasize the many natural connections between ML and statistical physics. A notable aspect of the review is the use of Python notebooks to introduce modern ML/statistical packages to readers using physics-inspired datasets (the Ising Model and Monte-Carlo simulations of supersymmetric decays of proton-proton collisions). We conclude with an extended outlook discussing possible uses of machine learning for furthering our understanding of the physical world as well as open problems in ML where physicists maybe able to contribute. (Notebooks are available at https://physics.bu.edu/\textasciitilde pankajm/MLnotebooks.html )},
  file = {/Users/ryedida/Zotero/storage/6WQTBNMG/Mehta et al. - 2018 - A high-bias, low-variance introduction to Machine Learning for physicists(2).pdf}
}

@unpublished{melas-kyriaziYouEvenNeed2021,
  title = {Do {{You Even Need Attention}}? {{A Stack}} of {{Feed-Forward Layers Does Surprisingly Well}} on {{ImageNet}}},
  shorttitle = {Do {{You Even Need Attention}}?},
  author = {Melas-Kyriazi, Luke},
  date = {2021-05-06},
  eprint = {2105.02723},
  eprinttype = {arxiv},
  eprintclass = {cs},
  url = {http://arxiv.org/abs/2105.02723},
  urldate = {2022-01-07},
  abstract = {The strong performance of vision transformers on image classification and other vision tasks is often attributed to the design of their multi-head attention layers. However, the extent to which attention is responsible for this strong performance remains unclear. In this short report, we ask: is the attention layer even necessary? Specifically, we replace the attention layer in a vision transformer with a feed-forward layer applied over the patch dimension. The resulting architecture is simply a series of feed-forward layers applied over the patch and feature dimensions in an alternating fashion. In experiments on ImageNet, this architecture performs surprisingly well: a ViT/DeiT-base-sized model obtains 74.9\% top-1 accuracy, compared to 77.9\% and 79.9\% for ViT and DeiT respectively. These results indicate that aspects of vision transformers other than attention, such as the patch embedding, may be more responsible for their strong performance than previously thought. We hope these results prompt the community to spend more time trying to understand why our current models are as effective as they are.},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,simple},
  annotation = {27 citations (Semantic Scholar/arXiv) [2022-01-07]},
  file = {/Users/ryedida/Zotero/storage/YJA9TJBA/Melas-Kyriazi - 2021 - Do You Even Need Attention A Stack of Feed-Forwar.pdf}
}

@inproceedings{Mell2018,
  author = {Mell, Johnathan and Lucas, Gale and Mozgai, Sharon and Boberg, Jill and Artstein, Ron and Gratch, Jonathan},
  title = {Towards a Repeated Negotiating Agent That Treats People Individually: {{Cooperation}}, Social Value Orientation, \& {{Machiavellianism}}},
  booktitle = {Proceedings of the 18th {{International Conference}} on {{Intelligent Virtual Agents}}, {{IVA}} 2018},
  date = {2018},
  pages = {125--132},
  isbn = {978-1-4503-6013-5},
  doi = {10.1145/3267851.3267910},
  keywords = {Human-Agent Negotiation,Personality Measures},
  abstract = {We present the results of a study in which humans negotiate with computerized agents employing varied tactics over a repeated number of economic ultimatum games. We report that certain agents are highly effective against particular classes of humans: several individual difference measures for the human participant are shown to be critical in determining which agents will be successful. Asking for favors works when playing with pro-social people but backfires with more selfish individuals. Further, making poor offers invites punishment from Machiavellian individuals. These factors may be learned once and applied over repeated negotiations, which means user modeling techniques that can detect these differences accurately will be more successful than those that don’t. Our work additionally shows that a significant benefit of cooperation is also present in repeated games—after sufficient interaction. These results have deep significance to agent designers who wish to design agents that are effective in negotiating with a broad swath of real human opponents. Furthermore, it demonstrates the effectiveness of techniques which can reason about negotiation over time.},
  file = {/Users/ryedida/Zotero/storage/6XBZCNH4/Mell et al. - 2018 - Towards a repeated negotiating agent that treats people individually Cooperation, social value orientation, & Ma(2).pdf}
}

@article{menzies10dp,
  title = {Defect {{Prediction}} from {{Static Code Features}}: {{Current Results}}, {{Limitations}}, {{New Approaches}}},
  author = {Menzies, Tim and Milton, Zach and Turhan, Burak and Cukic, Bojan and Jiang, Yue and Bener, Ayşe},
  date = {2010},
  journaltitle = {Automated Software Engineering},
  volume = {17},
  number = {4},
  pages = {375--407},
  doi = {10.1007/s10515-010-0069-5},
  keywords = {Defect prediction,Static code features,WHICH}
}

@article{menzies2003data,
  author = {Menzies, Tim and Hu, Ying},
  title = {Data Mining for Very Busy People},
  journaltitle = {Computer},
  publisher = {{IEEE}},
  date = {2003},
  volume = {36},
  number = {11},
  pages = {22--29}
}

@article{Menzies2005,
  title = {Verification and {{Validation}} and {{Artificial Intelligence}}},
  author = {Menzies, Tim and Pecheur, Charles},
  date = {2005},
  journaltitle = {Advances in Computers},
  volume = {65},
  pages = {153--201},
  issn = {0065-2458},
  doi = {10.1016/S0065-2458(05)65004-8},
  abstract = {Artificial Intelligence (AI) is useful. AI can deliver more functionality for reduced cost. AI should be used more widely but won't be unless developers can trust adaptive, nondeterministic, or complex AI systems. Verification and validation is one method used by software analysts to gain that trust. AI systems have features that make them hard to check using conventional V\&V methods. Nevertheless, as we show in this chapter, there are enough alternative readily-available methods that enable the V\&V of AI software. © 2005 Elsevier Inc. All rights reserved.},
  isbn = {0120121654},
  file = {/Users/ryedida/Zotero/storage/3GS5CYD7/Menzies, Pecheur - 2005 - Verification and Validation and Artificial Intelligence(2).pdf}
}

@article{Mills2017,
  title = {Being {{Sad Is Not Always Bad}}: {{The Influence}} of {{Affect}} on {{Expository Text Comprehension}}},
  author = {Mills, Caitlin and Wu, Jennifer and D’Mello, Sidney},
  date = {2017},
  journaltitle = {Discourse Processes},
  pages = {1--18},
  publisher = {{Routledge}},
  issn = {1532-6950},
  doi = {10.1080/0163853X.2017.1381059},
  url = {https://doi.org/10.1080/0163853X.2017.1381059},
  isbn = {9014871422},
  file = {/Users/ryedida/Zotero/storage/D4DFPK72/Mills, Wu, D’Mello - 2017 - Being Sad Is Not Always Bad The Influence of Affect on Expository Text Comprehension(2).pdf}
}

@inproceedings{minghimContentbasedTextMapping2006,
  author = {Minghim, Rosane and Paulovich, Fernando Vieira and family=Andrade Lopes, given=Alneu, prefix=de, useprefix=true},
  title = {Content-Based Text Mapping Using Multi-Dimensional Projections for Exploration of Document Collections},
  booktitle = {Visualization and {{Data Analysis}} 2006},
  publisher = {{SPIE}},
  date = {2006},
  volume = {6060},
  pages = {259--270},
  file = {/Users/ryedida/Zotero/storage/YQMNSWXW/Minghim et al_2006_Content-based text mapping using multi-dimensional projections for exploration.pdf}
}

@article{miottoDeepLearningHealthcare2018,
  title = {Deep Learning for Healthcare: Review, Opportunities and Challenges},
  shorttitle = {Deep Learning for Healthcare},
  author = {Miotto, Riccardo and Wang, Fei and Wang, Shuang and Jiang, Xiaoqian and Dudley, Joel T},
  date = {2018-11-27},
  journaltitle = {Briefings in Bioinformatics},
  volume = {19},
  number = {6},
  pages = {1236--1246},
  issn = {1467-5463, 1477-4054},
  doi = {10.1093/bib/bbx044},
  url = {https://academic.oup.com/bib/article/19/6/1236/3800524},
  urldate = {2024-01-10},
  abstract = {Gaining knowledge and actionable insights from complex, high-dimensional and heterogeneous biomedical data remains a key challenge in transforming health care. Various types of data have been emerging in modern biomedical research, including electronic health records, imaging, -omics, sensor data and text, which are complex, heterogeneous, poorly annotated and generally unstructured. Traditional data mining and statistical learning approaches typically need to first perform feature engineering to obtain effective and more robust features from those data, and then build prediction or clustering models on top of them. There are lots of challenges on both steps in a scenario of complicated data and lacking of sufficient domain knowledge. The latest advances in deep learning technologies provide new effective paradigms to obtain end-to-end learning models from complex data. In this article, we review the recent literature on applying deep learning technologies to advance the health care domain. Based on the analyzed work, we suggest that deep learning approaches could be the vehicle for translating big biomedical data into improved human health. However, we also note limitations and needs for improved methods development and applications, especially in terms of ease-of-understanding for domain experts and citizen scientists. We discuss such challenges and suggest developing holistic and meaningful interpretable architectures to bridge deep learning models and human interpretability.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/YFZVWPEK/Miotto et al. - 2018 - Deep learning for healthcare review, opportunitie.pdf}
}

@inproceedings{Misyrlis2017,
  title = {Spatio-{{Temporal Modeling}} of {{Criminal Activity}}},
  booktitle = {Proceedings of the 2nd International Workshop on Social Sensing - SocialSens'17},
  author = {Misyrlis, Michail and Cheung, Chung Ming and Srivastava, Ajitesh and Kannan, Rajgopal and Prasanna, Viktor},
  date = {2017},
  pages = {3--8},
  doi = {10.1145/3055601.3055613},
  url = {http://dl.acm.org/citation.cfm?doid=3055601.3055613},
  abstract = {Accurate crime forecasting can allow law enforcement to more effectively plan their resource allocation such as patrol routes and placements. We study the effectiveness of traditional regression approaches in forecasting crime occurrences in Portland, Oregon. We divide the area of interest into equally spaced cells and investigate the spatial autocorrelation between the crime occurrence rates of neighboring cells. We also attempt to use neighboring cells' information in the regression models along with the cell's own time series to enhance the forecast results. Our results show that regression is a promising method that outperforms a moving window averaging method, especially when the future horizon to be predicted increases. However, addition of neighborhood cells decreased the quality of predictions, suggesting that spatial correlation in crime is more complex than geographical neighborhood. We also explore a possibility of connection of criminal activities and popularity of crime incidents in Portland on the Web, and discuss future directions we will take to improve crime prediction.},
  isbn = {9781450349772},
  keywords = {crime prediction,social media crime anycasting,spatial correlation,time series,urban sensing},
  file = {/Users/ryedida/Zotero/storage/NQJSV6AM/Misyrlis et al. - 2017 - Spatio-Temporal Modeling of Criminal Activity(2).pdf}
}

@article{Mnih2015,
  title = {Human-Level Control through Deep Reinforcement Learning},
  author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and Petersen, Stig and Beattie, Charles and Sadik, Amir and Antonoglou, Ioannis and King, Helen and Kumaran, Dharshan and Wierstra, Daan and Legg, Shane and Hassabis, Demis},
  date = {2015},
  journaltitle = {Nature},
  volume = {518},
  number = {7540},
  eprint = {25719670},
  eprinttype = {pmid},
  pages = {529--533},
  issn = {1476-4687},
  doi = {10.1038/nature14236},
  abstract = {The theory of reinforcement learning provides a normative account, deeply rooted in psychological and neuroscientific perspectives on animal behaviour, of how agents may optimize their control of an environment. To use reinforcement learning successfully in situations approaching real-world complexity, however, agents are confronted with a difficult task: they must derive efficient representations of the environment from high-dimensional sensory inputs, and use these to generalize past experience to new situations. Remarkably, humans and other animals seem to solve this problem through a harmonious combination of reinforcement learning and hierarchical sensory processing systems, the former evidenced by a wealth of neural data revealing notable parallels between the phasic signals emitted by dopaminergic neurons and temporal difference reinforcement learning algorithms. While reinforcement learning agents have achieved some successes in a variety of domains, their applicability has previously been limited to domains in which useful features can be handcrafted, or to domains with fully observed, low-dimensional state spaces. Here we use recent advances in training deep neural networks to develop a novel artificial agent, termed a deep Q-network, that can learn successful policies directly from high-dimensional sensory inputs using end-to-end reinforcement learning. We tested this agent on the challenging domain of classic Atari 2600 games.},
  file = {/Users/ryedida/Zotero/storage/UXEKZPH9/Mnih et al. - 2015 - Human-level control through deep reinforcement learning(2).pdf}
}

@article{Molina2001,
  title = {Image Restoration in Astronomy: {{A}} {{Bayesian}} Perspective},
  author = {Molina, Rafael and Núñez, Jorge and Cortijo, Francisco José and Mateos, Javier},
  date = {2001},
  journaltitle = {IEEE Signal Processing Magazine},
  volume = {18},
  number = {2},
  pages = {11--29},
  issn = {1053-5888},
  doi = {10.1109/79.916318},
  abstract = {When preparing an article on image restoration in astronomy, it is obvious that some topics have to be dropped to keep the work at reasonable length. We have decided to concentrate on image and noise models and on the algorithms to find the restoration. Topics like parameter estimation and stopping rules are also commented on. We start by describing the Bayesian paradigm and then proceed to study the noise and blur models used by the astronomical community. Then the prior models used to restore astronomical images are examined. We describe the algorithms used to find the restoration for the most common combinations of degradation and image models. Then we comment on important issues such as acceleration of algorithms, stopping rules, and parameter estimation. We also comment on the huge amount of information available to, and made available by, the astronomical community},
  file = {/Users/ryedida/Zotero/storage/T4G35C5F/Molina et al. - 2001 - Image restoration in astronomy A bayesian perspective(2).pdf}
}

@inproceedings{Montero2018,
  title = {Does {{Deep Knowledge Tracing Model Interactions Among Skills}}?},
  booktitle = {Proceedings of the 11th {{International Conference}} on {{Educational Data Mining}}},
  author = {Montero, Shirly and Arora, Akshit and Kelly, Sean and Milne, Brent and Mozer, Michael},
  date = {2018},
  abstract = {Personalized learning environments requiring the elicitation of a student's knowledge state have inspired researchers to propose distinct models to understand that knowledge state. Recently, the spotlight has shone on comparisons between traditional, interpretable models such as Bayesian Knowledge Tracing (BKT) and complex, opaque neural network models such as Deep Knowledge Tracing (DKT). Although DKT appears to be a powerful predictive model, little effort has been expended to dissect the source of its strength. We begin with the observation that DKT differs from BKT along three dimensions: (1) DKT is a neural network with many free parameters, whereas BKT is a probabilistic model with few free parameters; (2) a single instance of DKT is used to model all skills in a domain, whereas a separate instance of BKT is constructed for each skill; and (3) the input to DKT interlaces practice from multiple skills, whereas the input to BKT is separated by skill. We tease apart these three dimensions by constructing versions of DKT which are trained on single skills and which are trained on sequences separated by skill. Exploration of three data sets reveals that dimensions (1) and (3) are critical; dimension (2) is not. Our investigation gives us insight into the structural regularities in the data that DKT is able to exploit but that BKT cannot.},
  keywords = {deep learning,knowledge tracing,online education,personalized learning,sequential modeling},
  file = {/Users/ryedida/Zotero/storage/YNWCKBIU/Montero et al. - 2018 - Does Deep Knowledge Tracing Model Interactions Among Skills(2).pdf}
}

@article{montufar2014number,
  author = {Montufar, Guido F and Pascanu, Razvan and Cho, Kyunghyun and Bengio, Yoshua},
  title = {On the Number of Linear Regions of Deep Neural Networks},
  journaltitle = {Advances in neural information processing systems},
  volume = {27},
  date = {2014},
  keywords = {deep learning,input space partition,maxout,neural network,rectifier},
  file = {/Users/ryedida/Zotero/storage/4EH84M9E/Montúfar et al. - 2014 - On the Number of Linear Regions of Deep Neural Networks(2).pdf}
}

@article{Mooijman2017,
  title = {Resisting {{Temptation}} for the {{Good}} of the {{Group}}: {{Binding Moral Values}} and the {{Moralization}} of {{Self-Control}}},
  author = {Mooijman, Marlon and Meindl, Peter and Dehghani, Morteza and Oyserman, Daphna and Doris, John M and Graham, Jesse},
  date = {2018},
  journaltitle = {Journal of Personality and Social Psychology},
  volume = {115},
  number = {3},
  eprint = {28604018},
  eprinttype = {pmid},
  pages = {585--599},
  issn = {0022-3514},
  doi = {10.1037/pspp0000149},
  keywords = {binding foundations,moral foundations theory,self-control moralization},
  file = {/Users/ryedida/Zotero/storage/3LL47SUX/Mooijman et al. - 2017 - Resisting Temptation for the Good of the Group Binding Moral Values and the Moralization of Self-Control(2).pdf}
}

@article{Mori2019,
  title = {Balancing the Trade-off between Accuracy and Interpretability in Software Defect Prediction},
  author = {Mori, Toshiki and Uchihira, Naoshi},
  date = {2019},
  journaltitle = {Empirical Software Engineering},
  volume = {24},
  number = {2},
  pages = {779--825},
  issn = {1573-7616},
  doi = {10.1007/s10664-018-9638-1},
  abstract = {Context: Classification techniques of supervised machine learning have been successfully applied to various domains of practice. When building a predictive model, there are two important criteria: predictive accuracy and interpretability, which generally have a trade-off relationship. In particular, interpretability should be accorded greater emphasis in the domains where the incorporation of expert knowledge into a predictive model is required. Objective: The aim of this research is to propose a new classification model, called superposed naive Bayes (SNB), which transforms a naive Bayes ensemble into a simple naive Bayes model by linear approximation. Method: In order to evaluate the predictive accuracy and interpretability of the proposed method, we conducted a comparative study using well-known classification techniques such as rule-based learners, decision trees, regression models, support vector machines, neural networks, Bayesian learners, and ensemble learners, over 13 real-world public datasets. Results: A trade-off analysis between the accuracy and interpretability of different classification techniques was performed with a scatter plot comparing relative ranks of accuracy with those of interpretability. The experiment results show that the proposed method (SNB) can produce a balanced output that satisfies both accuracy and interpretability criteria. Conclusions: SNB offers a comprehensible predictive model based on a simple and transparent model structure, which can provide an effective way for balancing the trade-off between accuracy and interpretability. © 2018, Springer Science+Business Media, LLC, part of Springer Nature.},
  keywords = {Ensemble learning,Interpretability,Model approximation,Naive Bayes classifier,Predictive accuracy,Software defect prediction,Trade-off analysis,Weights of evidence},
  file = {/Users/ryedida/Zotero/storage/42EDVGU8/Mori, Uchihira - 2019 - Balancing the trade-off between accuracy and interpretability in software defect prediction(2).pdf}
}

@inproceedings{movckus1975bayesian,
  author = {Močkus, Jonas},
  title = {On {{Bayesian}} Methods for Seeking the Extremum},
  booktitle = {Optimization Techniques {{IFIP}} Technical Conference: {{Novosibirsk}}, July 1–7, 1974},
  publisher = {{Springer}},
  date = {1975},
  pages = {400--404}
}

@book{munkresTopology2000,
  title = {Topology},
  author = {Munkres, James R.},
  date = {2000},
  edition = {2},
  publisher = {{Prentice Hall}},
  location = {{Upper Saddle River, NJ}},
  url = {https://catalog.lib.ncsu.edu/catalog/NCSU2221001},
  langid = {english},
  keywords = {Topology},
  file = {/Users/ryedida/Zotero/storage/JRUB2PVM/Munkres_2000_Topology.pdf}
}

@article{Muralidhara2018,
  author = {Muralidhara, Sachin and Paul, Michael J},
  title = {\#{{Healthy Selfies}}: {{Exploration}} of {{Health Topics}} on {{Instagram}}},
  journaltitle = {JMIR Public Health and Surveillance},
  date = {2018},
  volume = {4},
  number = {2},
  pages = {e10150},
  issn = {2369-2960},
  doi = {10.2196/10150},
  eprint = {29959106},
  eprinttype = {pmid},
  url = {http://publichealth.jmir.org/2018/2/e10150/},
  keywords = {computer vision,image sharing,instagram,public health,social media,topic modeling},
  abstract = {BACKGROUND Social media provides a complementary source of information for public health surveillance. The dominate data source for this type of monitoring is the microblogging platform Twitter, which is convenient due to the free availability of public data. Less is known about the utility of other social media platforms, despite their popularity. OBJECTIVE This work aims to characterize the health topics that are prominently discussed in the image-sharing platform Instagram, as a step toward understanding how this data might be used for public health research. METHODS The study uses a topic modeling approach to discover topics in a dataset of 96,426 Instagram posts containing hashtags related to health. We use a polylingual topic model, initially developed for datasets in different natural languages, to model different modalities of data: hashtags, caption words, and image tags automatically extracted using a computer vision tool. RESULTS We identified 47 health-related topics in the data (kappa=.77), covering ten broad categories: acute illness, alternative medicine, chronic illness and pain, diet, exercise, health care \& medicine, mental health, musculoskeletal health and dermatology, sleep, and substance use. The most prevalent topics were related to diet (8,293/96,426; 8.6\% of posts) and exercise (7,328/96,426; 7.6\% of posts). CONCLUSIONS A large and diverse set of health topics are discussed in Instagram. The extracted image tags were generally too coarse and noisy to be used for identifying posts but were in some cases accurate for identifying images relevant to studying diet and substance use. Instagram shows potential as a source of public health information, though limitations in data collection and metadata availability may limit its use in comparison to platforms like Twitter.},
  file = {/Users/ryedida/Zotero/storage/A6KZYG5T/Muralidhara, Paul - 2018 - #Healthy Selfies Exploration of Health Topics on Instagram(2).pdf}
}

@inproceedings{Na2018,
  title = {Cascade Adversarial Machine Learning Regularized with a Unified Embedding},
  booktitle = {6th International Conference on Learning Representations, ICLR 2018 - Conference Track Proceedings},
  author = {Na, Taesik and Ko, Jong Hwan and Mukhopadhyay, Saibal},
  date = {2018},
  eprint = {1708.02582},
  eprinttype = {arxiv},
  pages = {1--16},
  abstract = {Injecting adversarial examples during training, known as adversarial training, can improve robustness against one-step attacks, but not for unknown iterative attacks. To address this challenge, we first show iteratively generated adversarial images easily transfer between networks trained with the same strategy. Inspired by this observation, we propose cascade adversarial training, which transfers the knowledge of the end results of adversarial training. We train a network from scratch by injecting iteratively generated adversarial images crafted from already defended networks in addition to one-step adversarial images from the network being trained. We also propose to utilize embedding space for both classification and low-level (pixel-level) similarity learning to ignore unknown pixel level perturbation. During training, we inject adversarial images without replacing their corresponding clean images and penalize the distance between the two embeddings (clean and adversarial). Experimental results show that cascade adversarial training together with our proposed low-level similarity learning efficiently enhances the robustness against iterative attacks, but at the expense of decreased robustness against one-step attacks. We show that combining those two techniques can also improve robustness under the worst case black box attack scenario.},
  file = {/Users/ryedida/Zotero/storage/XHVSQ9XS/Na, Ko, Mukhopadhyay - 2018 - Cascade adversarial machine learning regularized with a unified embedding(2).pdf}
}

@inproceedings{nair2010rectified,
  title = {Rectified Linear Units Improve Restricted Boltzmann Machines},
  booktitle = {Proceedings of the 27th {{International Conference}} on {{Machine Learning}} ({{ICML}}-10)},
  author = {Nair, Vinod and Hinton, Geoffrey E},
  date = {2010},
  pages = {807--814}
}

@inproceedings{nairFuncGNNGraphNeural2020,
  title = {{{funcGNN}}: {{A Graph Neural Network Approach}} to {{Program Similarity}}},
  shorttitle = {{{funcGNN}}},
  booktitle = {Proceedings of the 14th {{ACM}} / {{IEEE International Symposium}} on {{Empirical Software Engineering}} and {{Measurement}} ({{ESEM}})},
  author = {Nair, Aravind and Roy, Avijit and Meinke, Karl},
  date = {2020-10-05},
  pages = {1--11},
  publisher = {{ACM}},
  location = {{Bari, Italy}},
  doi = {10.1145/3382494.3410675},
  url = {https://dl.acm.org/doi/10.1145/3382494.3410675},
  urldate = {2021-10-23},
  abstract = {Background: Program similarity is a fundamental concept, central to the solution of software engineering tasks such as software plagiarism, clone identification, code refactoring and code search. Accurate similarity estimation between programs requires an in-depth understanding of their structure, semantics and flow. A control flow graph (CFG), is a graphical representation of a program which captures its logical control flow and hence its semantics. A common approach is to estimate program similarity by analysing CFGs using graph similarity measures, e.g. graph edit distance (GED). However, graph edit distance is an NP-hard problem and computationally expensive, making the application of graph similarity techniques to complex software programs impractical. Aim: This study intends to examine the effectiveness of graph neural networks to estimate program similarity, by analysing the associated control flow graphs. Method: We introduce funcGNN, which is a graph neural network trained on labeled CFG pairs to predict the GED between unseen program pairs by utilizing an effective embedding vector. To our knowledge, this is the first time graph neural networks have been applied on labeled CFGs for estimating the similarity between high-level language programs. Results: We demonstrate the effectiveness of funcGNN to estimate the GED between programs and our experimental analysis demonstrates how it achieves a lower error rate (1.94 ×10−3), with faster (23 times faster than the quickest traditional GED approximation method) and better scalability compared with state of the art methods. Conclusion: funcGNN possesses the inductive learning ability to infer program structure and generalise to unseen programs. The graph embedding of a program proposed by our methodology could be applied to several related software engineering problems (such as code plagiarism and clone identification) thus opening multiple research directions.},
  eventtitle = {{{ESEM}} '20: {{ACM}} / {{IEEE International Symposium}} on {{Empirical Software Engineering}} and {{Measurement}}},
  isbn = {978-1-4503-7580-1},
  langid = {english},
  annotation = {3 citations (Semantic Scholar/DOI) [2021-10-22]},
  file = {/Users/ryedida/Zotero/storage/WFN8HMQF/Nair et al. - 2020 - funcGNN A Graph Neural Network Approach to Progra.pdf}
}

@article{napier2023empirical,
  author = {Napier, Kollin and Bhowmik, Tanmay and Wang, Shaowei},
  title = {An Empirical Study of Text-Based Machine Learning Models for Vulnerability Detection},
  journaltitle = {Empirical Software Engineering},
  publisher = {{Springer}},
  date = {2023},
  volume = {28},
  number = {2},
  pages = {38}
}

@inproceedings{Narayana2018,
  title = {Gesture {{Recognition}}: {{Focus}} on the {{Hands}}},
  booktitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  author = {Narayana, Pradyumna and Beveridge, J. Ross and Draper, Bruce A.},
  date = {2018},
  pages = {5235--5244},
  doi = {10.1109/CVPR.2018.00549},
  isbn = {978-1-5386-6420-9},
  file = {/Users/ryedida/Zotero/storage/VZ5NC7G8/Narayana, Beveridge, Draper - 2018 - Gesture Recognition Focus on the Hands(2).pdf}
}

@article{Naselaris,
  title = {{{Bayesian Reconstruction}} of {{Natural Images}} from {{Human Brain Activity}}},
  author = {Naselaris, Thomas and Prenger, Ryan J and Kay, Kendrick N and Oliver, Michael and Gallant, Jack L},
  date = {2009},
  journaltitle = {Neuron},
  volume = {63},
  number = {6},
  pages = {902--915},
  doi = {10.1016/j.neuron.2009.09.006},
  abstract = {Recent studies have used fMRI signals from early visual areas to reconstruct simple geometric patterns. Here, we demonstrate a new Bayesian decoder that uses fMRI signals from early and anterior visual areas to reconstruct complex natural images. Our decoder combines three elements: a structural encoding model that characterizes responses in early visual areas, a semantic encoding model that characterizes responses in anterior visual areas, and prior information about the structure and semantic content of natural images. By combining all these elements, the decoder produces reconstructions that accurately reflect both the spatial structure and semantic category of the objects contained in the observed natural image. Our results show that prior information has a substantial effect on the quality of natural image reconstructions. We also demonstrate that much of the variance in the responses of anterior visual areas to complex natural images is explained by the semantic category of the image alone.}
}

@article{nemirovskiCezariConvergenceSteepest1978,
  title = {On {{Cezari}}'s Convergence of the Steepest Descent Method for Approximating Saddle Point of Convex-Concave Functions},
  author = {Nemirovski, Arkadi and Yudin, D.},
  date = {1978},
  journaltitle = {Soviet {{Mathematics}}. {{Doklady}}},
  volume = {19},
  number = {2},
  pages = {258--269},
  issn = {0197-6788},
  file = {/Users/ryedida/Zotero/storage/G8U3VHRY/Nemirovski_Yudin_1978_On Cezari's convergence of the steepest descent method for approximating saddle.pdf}
}

@inproceedings{neyshaburExploringGeneralizationDeep,
  title = {Exploring {{Generalization}} in {{Deep Learning}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Neyshabur, Behnam and Bhojanapalli, Srinadh and McAllester, David and Srebro, Nati},
  date = {2017},
  volume = {30},
  abstract = {With a goal of understanding what drives generalization in deep networks, we consider several recently suggested explanations, including norm-based control, sharpness and robustness. We study how these measures can ensure generalization, highlighting the importance of scale normalization, and making a connection between sharpness and PAC-Bayes theory. We then investigate how well the measures explain different observed phenomena.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/RQQ99CBE/Neyshabur et al. - Exploring Generalization in Deep Learning.pdf}
}

@online{neyshaburSearchRealInductive2015,
  author = {Neyshabur, Behnam and Tomioka, Ryota and Srebro, Nathan},
  title = {In {{Search}} of the {{Real Inductive Bias}}: {{On}} the {{Role}} of {{Implicit Regularization}} in {{Deep Learning}}},
  shorttitle = {In {{Search}} of the {{Real Inductive Bias}}},
  date = {2015-04-16},
  pubstate = {preprint},
  eprinttype = {arxiv},
  eprint = {1412.6614},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1412.6614},
  urldate = {2024-01-27},
  langid = {english},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract = {We present experiments demonstrating that some other form of capacity control, different from network size, plays a central role in learning multi-layer feedforward networks. We argue, partially through analogy to matrix factorization, that this is an inductive bias that can help shed light on deep learning.},
  file = {/Users/ryedida/Zotero/storage/IN7XIJMD/Neyshabur et al. - 2015 - In Search of the Real Inductive Bias On the Role .pdf}
}

@inproceedings{nguyen2017exploring,
  title = {Exploring {{API}} Embedding for {{API}} Usages and Applications},
  booktitle = {2017 {{IEEE}}/{{ACM}} 39th {{International Conference}} on {{Software Engineering}} ({{ICSE}})},
  author = {Nguyen, Trong Duc and Nguyen, Anh Tuan and Phan, Hung Dang and Nguyen, Tien N.},
  date = {2017},
  pages = {438--449},
  publisher = {{IEEE}}
}

@inproceedings{Nie2018,
  title = {Visualizing {{Deep Neural Networks}} for {{Text Analytics}}},
  booktitle = {2018 {{IEEE Pacific Visualization Symposium}} ({{PacificVis}})},
  author = {Nie, Shaoliang and Healey, Christopher and Padia, Kalpesh and Leeman-Munk, Samuel and Benson, Jordan and Caira, Dave and Sethi, Saratendu and Devarajan, Ravi},
  date = {2018},
  pages = {180--189},
  issn = {2165-8773},
  doi = {10.1109/PacificVis.2018.00031},
  abstract = {© 2018 IEEE. Deep neural networks (DNNs) have made tremendous progress in many different areas in recent years. How these networks function internally, however, is often not well understood. Advances in understanding DNNs will benefit and accelerate the development of the field. We present TNNVis, a visualization system that supports understanding of deep neural networks specifically designed to analyze text. TNNVis focuses on DNNs composed of fully connected and convolutional layers. It integrates visual encodings and interaction techniques chosen specifically for our tasks. The tool allows users to: (1) visually explore DNN models with arbitrary input using a combination of node-link diagrams and matrix representation; (2) quickly identify activation values, weights, and feature map patterns within a network; (3) flexibly focus on visual information of interest with threshold, inspection, insight query, and tooltip operations; (4) discover network activation and training patterns through animation; and (5) compare differences between internal activation patterns for different inputs to the DNN. These functions allow neural network researchers to examine their DNN models from new perspectives, producing insights on how these models function. Clustering and summarization techniques are employed to support large convolutional and fully connected layers. Based on several part of speech models with different structure and size, we present multiple use cases where visualization facilitates an understanding of the models.},
  isbn = {9781538614242},
  keywords = {deep learning,human centered computing,information visualization,machine learning,visualization design},
  file = {/Users/ryedida/Zotero/storage/TR6J5ZEC/Nie et al. - 2018 - Visualizing Deep Neural Networks for Text Analytics(2).pdf}
}

@article{Nishimoto2011,
  title = {Reconstructing Visual Experiences from Brain Activity Evoked by Natural Movies},
  author = {Nishimoto, Shinji and Vu, An T. and Naselaris, Thomas and Benjamini, Yuval and Yu, Bin and Gallant, Jack L.},
  date = {2011},
  journaltitle = {Current Biology},
  volume = {21},
  number = {19},
  eprint = {21945275},
  eprinttype = {pmid},
  pages = {1641--1646},
  issn = {0960-9822},
  doi = {10.1016/j.cub.2011.08.031},
  abstract = {Quantitative modeling of human brain activity can provide crucial insights about cortical representations [1, 2] and can form the basis for brain decoding devices [3-5]. Recent functional magnetic resonance imaging (fMRI) studies have modeled brain activity elicited by static visual patterns and have reconstructed these patterns from brain activity [6-8]. However, blood oxygen level-dependent (BOLD) signals measured via fMRI are very slow [9], so it has been difficult to model brain activity elicited by dynamic stimuli such as natural movies. Here we present a new motion-energy [10, 11] encoding model that largely overcomes this limitation. The model describes fast visual information and slow hemodynamics by separate components. We recorded BOLD signals in occipitotemporal visual cortex of human subjects who watched natural movies and fit the model separately to individual voxels. Visualization of the fit models reveals how early visual areas represent the information in movies. To demonstrate the power of our approach, we also constructed a Bayesian decoder [8] by combining estimated encoding models with a sampled natural movie prior. The decoder provides remarkable reconstructions of the viewed movies. These results demonstrate that dynamic brain activity measured under naturalistic conditions can be decoded using current fMRI technology. © 2011 Elsevier Ltd. All rights reserved.}
}

@article{niuComparingPretrainedModels2023,
  author = {Niu, Changan and Li, Chuanyi and Ng, Vincent and Luo, Bin},
  title = {Comparing the {{Pretrained Models}} of {{Source Code}} by {{Re-pretraining Under}} a {{Unified Setup}}},
  journaltitle = {IEEE Transactions on Neural Networks and Learning Systems},
  shortjournal = {IEEE Trans. Neural Netw. Learning Syst.},
  date = {2023},
  pages = {1--11},
  doi = {10.1109/TNNLS.2023.3308595},
  issn = {2162-237X, 2162-2388},
  url = {https://ieeexplore.ieee.org/document/10246327/},
  urldate = {2023-10-10},
  langid = {english},
  abstract = {Recent years have seen the successful application of large pretrained models of source code (CodePTMs) to code representation learning, which have taken the field of software engineering (SE) from task-specific solutions to task-agnostic generic models. By the remarkable results, CodePTMs are seen as a promising direction in both academia and industry. While a number of CodePTMs have been proposed, they are often not directly comparable because they differ in experimental setups such as pretraining dataset, model size, evaluation tasks, and datasets. In this article, we first review the experimental setup used in previous work and propose a standardized setup to facilitate fair comparisons among CodePTMs to explore the impacts of their pretraining tasks. Then, under the standardized setup, we re-pretrain CodePTMs using the same model architecture, input modalities, and pretraining tasks, as they declared and fine-tune each model on each evaluation SE task for evaluating. Finally, we present the experimental results and make a comprehensive discussion on the relative strength and weakness of different pretraining tasks with respect to each SE task. We hope our view can inspire and advance the future study of more powerful CodePTMs.},
  file = {/Users/ryedida/Zotero/storage/GPTK2PIN/Niu et al. - 2023 - Comparing the Pretrained Models of Source Code by .pdf}
}

@unpublished{Oh2019,
  author = {Oh, Tae-Hyun and Dekel, Tali and Kim, Changil and Mosseri, Inbar and Freeman, William T. and Rubinstein, Michael and Matusik, Wojciech},
  title = {{{Speech2Face}}: {{Learning}} the {{Face Behind}} a {{Voice}}},
  date = {2019},
  eprinttype = {arxiv},
  eprint = {1905.09773},
  url = {http://arxiv.org/abs/1905.09773},
  abstract = {How much can we infer about a person's looks from the way they speak? In this paper, we study the task of reconstructing a facial image of a person from a short audio recording of that person speaking. We design and train a deep neural network to perform this task using millions of natural Internet/YouTube videos of people speaking. During training, our model learns voice-face correlations that allow it to produce images that capture various physical attributes of the speakers such as age, gender and ethnicity. This is done in a self-supervised manner, by utilizing the natural co-occurrence of faces and speech in Internet videos, without the need to model attributes explicitly. We evaluate and numerically quantify how--and in what manner--our Speech2Face reconstructions, obtained directly from audio, resemble the true face images of the speakers.},
  file = {/Users/ryedida/Zotero/storage/Y8YWLCPQ/Oh et al. - 2019 - Speech2Face Learning the Face Behind a Voice(2).pdf}
}

@inproceedings{pacula2012hyperparameter,
  author = {Pacula, Maciej and Ansel, Jason and Amarasinghe, Saman and O’Reilly, Una-May},
  title = {Hyperparameter Tuning in Bandit-Based Adaptive Operator Selection},
  booktitle = {Applications of Evolutionary Computation: {{EvoApplications}} 2012: {{EvoCOMNET}}, {{EvoCOMPLEX}}, {{EvoFIN}}, {{EvoGAMES}}, {{EvoHOT}}, {{EvoIASP}}, {{EvoNUM}}, {{EvoPAR}}, {{EvoRISK}}, {{EvoSTIM}}, and {{EvoSTOC}}, Málaga, Spain, April 11-13, 2012, Proceedings},
  publisher = {{Springer}},
  date = {2012},
  pages = {73--82}
}

@article{Pan2012,
  title = {Modeling {{Dynamical Influence}} in {{Human Interaction}}: {{Using}} Data to Make Better Inferences about Influence within Social Systems},
  shorttitle = {Modeling {{Dynamical Influence}} in {{Human Interaction}}},
  author = {Pan, Wei and Dong, Wen and Cebrian, Manuel and Kim, Taemie and Fowler, James H. and Pentland, Alex Sandy},
  date = {2012},
  journaltitle = {IEEE Signal Processing Magazine},
  volume = {29},
  number = {2},
  pages = {77--86},
  issn = {1053-5888},
  doi = {10.1109/MSP.2011.942737},
  abstract = {How can we model influence between individuals in a social system, even when the network of interactions is unknown? In this article, we review the literature on the “influence model,” which utilizes independent time series to estimate how much the state of one actor affects the state of another actor in the system. We extend this model to incorporate dynamical parameters that allow us to infer how influence changes over time, and we provide three examples of how this model can be applied to simulated and real data. The results show that the model can recover known estimates of influence, it generates results that are consistent with other measures of social networks, and it allows us to uncover important shifts in the way states may be transmitted between actors at different points in time.},
  file = {/Users/ryedida/Zotero/storage/XUT7PD7W/Pan et al. - 2012 - in Human Interaction(2).pdf}
}

@inproceedings{pangImprovingAdversarialRobustness,
  title = {Improving {{Adversarial Robustness}} via {{Promoting Ensemble Diversity}}},
  booktitle = {Proceedings of the 36th {{International Conference}} on {{Machine Learning}}},
  author = {Pang, Tianyu and Xu, Kun and Du, Chao and Chen, Ning and Zhu, Jun},
  date = {2019},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {97},
  pages = {4970--4979},
  publisher = {{PMLR}},
  abstract = {Though deep neural networks have achieved significant progress on various tasks, often enhanced by model ensemble, existing high-performance models can be vulnerable to adversarial attacks. Many efforts have been devoted to enhancing the robustness of individual networks and then constructing a straightforward ensemble, e.g., by directly averaging the outputs, which ignores the interaction among networks. This paper presents a new method that explores the interaction among individual networks to improve robustness for ensemble models. Technically, we define a new notion of ensemble diversity in the adversarial setting as the diversity among non-maximal predictions of individual members, and present an adaptive diversity promoting (ADP) regularizer to encourage the diversity, which leads to globally better robustness for the ensemble by making adversarial examples difficult to transfer among individual members. Our method is computationally efficient and compatible with the defense methods acting on individual networks. Empirical results on various datasets verify that our method can improve adversarial robustness while maintaining state-of-the-art accuracy on normal examples.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/TZFWW7ZX/Pang et al. - Improving Adversarial Robustness via Promoting Ens.pdf}
}

@unpublished{pangMixupInferenceBetter2020,
  author = {Pang, Tianyu and Xu, Kun and Zhu, Jun},
  title = {Mixup {{Inference}}: {{Better Exploiting Mixup}} to {{Defend Adversarial Attacks}}},
  shorttitle = {Mixup {{Inference}}},
  date = {2020-02-20},
  eprinttype = {arxiv},
  eprint = {1909.11515},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1909.11515},
  urldate = {2021-03-29},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation = {25 citations (Semantic Scholar/arXiv) [2021-03-29]},
  abstract = {It has been widely recognized that adversarial examples can be easily crafted to fool deep networks, which mainly root from the locally unreasonable behavior nearby input examples. Applying mixup in training provides an effective mechanism to improve generalization performance and model robustness against adversarial perturbations, which introduces the globally linear behavior in-between training examples. However, in previous work, the mixup-trained models only passively defend adversarial attacks in inference by directly classifying the inputs, where the induced global linearity is not well exploited. Namely, since the locality of the adversarial perturbations, it would be more efficient to actively break the locality via the globality of the model predictions. Inspired by simple geometric intuition, we develop an inference principle, named mixup inference (MI), for mixup-trained models. MI mixups the input with other random clean samples, which can shrink and transfer the equivalent perturbation if the input is adversarial. Our experiments on CIFAR-10 and CIFAR-100 demonstrate that MI can further improve the adversarial robustness for the models trained by mixup and its variants.},
  file = {/Users/ryedida/Zotero/storage/28MVNSSQ/Pang et al. - 2020 - Mixup Inference Better Exploiting Mixup to Defend.pdf}
}

@unpublished{pangRethinkingSoftmaxCrossEntropy2020,
  title = {Rethinking {{Softmax Cross-Entropy Loss}} for {{Adversarial Robustness}}},
  author = {Pang, Tianyu and Xu, Kun and Dong, Yinpeng and Du, Chao and Chen, Ning and Zhu, Jun},
  date = {2020-02-20},
  eprint = {1905.10626},
  eprinttype = {arxiv},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1905.10626},
  urldate = {2021-04-05},
  abstract = {Previous work shows that adversarially robust generalization requires larger sample complexity, and the same dataset, e.g., CIFAR-10, which enables good standard accuracy may not suffice to train robust models. Since collecting new training data could be costly, we instead focus on inducing locally dense sample distribution, i.e., high sample density in the feature space which could lead to locally sufficient samples for robust learning. We first formally show that the softmax cross-entropy (SCE) loss and its variants induce inappropriate sample density distributions in the feature space, which inspires us to design appropriate training objectives. Specifically, we propose the Max-Mahalanobis center (MMC) loss to create high-density regions for better robustness. It encourages the learned features to gather around the preset class centers with optimal inter-class dispersion. Comparing to the SCE loss and its variants, we empirically demonstrate that applying the MMC loss can significantly improve robustness even under strong adaptive attacks, while keeping state-of-the-art accuracy on clean inputs with little extra computation.},
  langid = {english},
  keywords = {Computer Science - Cryptography and Security,Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation = {33 citations (Semantic Scholar/arXiv) [2021-04-05]},
  file = {/Users/ryedida/Zotero/storage/RP47MVYJ/Pang et al. - 2020 - Rethinking Softmax Cross-Entropy Loss for Adversar.pdf}
}

@thesis{papakroni2013data,
  type = {mathesis},
  title = {Data Carving: {{Identifying}} and Removing Irrelevancies in the Data},
  author = {Papakroni, Vasil},
  date = {2013},
  institution = {West Virginia University}
}

@unpublished{Parr2018,
  title = {The {{Matrix Calculus You Need For Deep Learning}}},
  author = {Parr, Terence and Howard, Jeremy},
  date = {2018},
  eprint = {1802.01528},
  eprinttype = {arxiv},
  pages = {1--33},
  url = {http://arxiv.org/abs/1802.01528},
  abstract = {This paper is an attempt to explain all the matrix calculus you need in order to understand the training of deep neural networks. We assume no math knowledge beyond what you learned in calculus 1, and provide links to help you refresh the necessary math where needed. Note that you do not need to understand this material before you start learning to train and use deep learning in practice; rather, this material is for those who are already familiar with the basics of neural networks, and wish to deepen their understanding of the underlying math. Don't worry if you get stuck at some point along the way---just go back and reread the previous section, and try writing down and working through some examples. And if you're still stuck, we're happy to answer your questions in the Theory category at forums.fast.ai. Note: There is a reference section at the end of the paper summarizing all the key matrix calculus rules and terminology discussed here. See related articles at http://explained.ai}
}

@inproceedings{Pei2017,
  title = {{{DeepXplore}}: {{Automated Whitebox Testing}} of {{Deep Learning Systems}}},
  booktitle = {Proceedings of the 26th {{ACM Symposium}} on {{Operating Systems Principles}}},
  author = {Pei, Kexin and Cao, Yinzhi and Yang, Junfeng and Jana, Suman},
  date = {2017},
  series = {{{SOSP}} '17},
  eprint = {1705.06640},
  eprinttype = {arxiv},
  pages = {1--18},
  doi = {10.1145/3132747.3132785},
  abstract = {Deep learning (DL) systems are increasingly deployed in safety- and security-critical domains including self-driving cars and malware detection, where the correctness and predictability of a system’s behavior for corner case inputs are of great importance. Existing DL testing depends heavily on manually labeled data and therefore often fails to expose erroneous behaviors for rare inputs. We design, implement, and evaluate DeepXplore, the first whitebox framework for systematically testing real-world DL systems. First, we introduce neuron coverage for systematically measuring the parts of a DL system exercised by test inputs. Next, we leverage multiple DL systems with similar functionality as cross-referencing oracles to avoid manual checking. Finally, we demonstrate how finding inputs for DL systems that both trigger many differential behaviors and achieve high neuron coverage can be represented as a joint optimization problem and solved efficiently using gradient-based search techniques. DeepXplore efficiently finds thousands of incorrect corner case behaviors (e.g., self-driving cars crashing into guard rails and malware masquerading as benign software) in state-of-the-art DL models with thousands of neurons trained on five popular datasets including ImageNet and Udacity self-driving challenge data. For all tested DL models, on average, DeepXplore generated one test input demonstrating incorrect behavior within one second while running only on a commodity laptop. We further show that the test inputs generated by DeepXplore can also be used to retrain the corresponding DL model to improve the model’s accuracy by up to 3\%.},
  isbn = {9781450350853},
  keywords = {Deep learning testing,Differential testing,Whitebox testing},
  file = {/Users/ryedida/Zotero/storage/Z3NAFDT6/Pei et al. - 2017 - DeepXplore Automated Whitebox Testing of Deep Learning Systems(2).pdf}
}

@article{Pelanek2016,
  title = {Applications of the {{Elo}} Rating System in Adaptive Educational Systems},
  author = {Pelánek, Radek},
  date = {2016},
  journaltitle = {Computers \& Education},
  volume = {98},
  pages = {169--179},
  issn = {0360-1315},
  doi = {10.1016/j.compedu.2016.03.017},
  abstract = {The Elo rating system was originally developed for rating chess players, nowadays it is widely used for ranking players of many other games. The system can be used in educational systems when we interpret student's answer to an item as a match between the student and the item. In this way we can easily dynamically estimate the skill of students and difficulty of items. We provide a systematic overview of different variants of the Elo rating system and their application in education. We compare the Elo rating system to alternative methods and describe a specific case study (an adaptive practice of geography facts) to illustrate the application of the Elo rating system in education. We argue that the Elo rating system is simple, robust, and effective and thus suitable for use in the development of adaptive educational systems. We provide specific guidelines for such applications.}
}

@article{Pena-Ayala2014,
  title = {Educational Data Mining: {{A}} Survey and a Data Mining-Based Analysis of Recent Works},
  author = {Peña-Ayala, Alejandro},
  date = {2014},
  journaltitle = {Expert Systems with Applications},
  volume = {41},
  number = {4},
  pages = {1432--1462},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2013.08.042},
  abstract = {This review pursues a twofold goal, the first is to preserve and enhance the chronicles of recent educational data mining (EDM) advances development; the second is to organize, analyze, and discuss the content of the review based on the outcomes produced by a data mining (DM) approach. Thus, as result of the selection and analysis of 240 EDM works, an EDM work profile was compiled to describe 222 EDM approaches and 18 tools. A profile of the EDM works was organized as a raw data base, which was transformed into an ad-hoc data base suitable to be mined. As result of the execution of statistical and clustering processes, a set of educational functionalities was found, a realistic pattern of EDM approaches was discovered, and two patterns of value-instances to depict EDM approaches based on descriptive and predictive models were identified. One key finding is: most of the EDM approaches are ground on a basic set composed by three kinds of educational systems, disciplines, tasks, methods, and algorithms each. The review concludes with a snapshot of the surveyed EDM works, and provides an analysis of the EDM strengths, weakness, opportunities, and threats, whose factors represent, in a sense, future work to be fulfilled. © 2013 Elsevier Ltd. All rights reserved.}
}

@inproceedings{pengIntegratingTreePath,
  title = {Integrating {{Tree Path}} in {{Transformer}} for {{Code Representation}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Peng, Han and Li, Ge and Wang, Wenhan and Zhao, Yunfei and Jin, Zhi},
  date = {2021},
  volume = {34},
  abstract = {Learning distributed representation of source code requires modelling its syntax and semantics. Recent state-of-the-art models leverage highly structured source code representations, such as the syntax trees and paths therein. In this paper, we investigate two representative path encoding methods shown in previous research work and integrate them into the attention module of Transformer. We draw inspiration from the ideas of positional encoding and modify them to incorporate these path encoding. Specifically, we encode both the pairwise path between tokens of source code and the path from the leaf node to the tree root for each token in the syntax tree. We explore the interaction between these two kinds of paths by integrating them into the unified Transformer framework. The detailed empirical study for path encoding methods also leads to our novel state-of-the-art representation model TPTrans, which finally outperforms strong baselines. Extensive experiments and ablation studies on code summarization across four different languages demonstrate the effectiveness of our approaches. We release our code at https://github.com/AwdHanPeng/TPTrans.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/ZLDSPB9N/Peng et al. - Integrating Tree Path in Transformer for Code Repr.pdf}
}

@article{Pennell2014,
  title = {Normalization of Informal Text},
  author = {Pennell, Deana L. and Liu, Yang},
  date = {2014},
  journaltitle = {Computer Speech and Language},
  volume = {28},
  number = {1},
  pages = {256--277},
  publisher = {{Elsevier Ltd}},
  issn = {0885-2308},
  doi = {10.1016/j.csl.2013.07.001},
  abstract = {This paper describes a noisy-channel approach for the normalization of informal text, such as that found in emails, chat rooms, and SMS messages. In particular, we introduce two character-level methods for the abbreviation modeling aspect of the noisy channel model: a statistical classifier using language-based features to decide whether a character is likely to be removed from a word, and a character-level machine translation model. A two-phase approach is used; in the first stage the possible candidates are generated using the selected abbreviation model and in the second stage we choose the best candidate by decoding using a language model. Overall we find that this approach works well and is on par with current research in the field. © 2013 Elsevier Ltd.},
  keywords = {NLP applications,Noisy text,Text normalization}
}

@report{Petersen2012,
  type = {techreport},
  title = {The {{Matrix Cookbook}}},
  author = {Petersen, Kaare Brandt and Pedersen, Michael Syskind},
  date = {2012-11-15},
  institution = {Technical University of Denmark},
  version = {20121115}
}

@inproceedings{pezeshkiGradientStarvationLearning,
  title = {Gradient {{Starvation}}: {{A Learning Proclivity}} in {{Neural Networks}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Pezeshki, Mohammad and Kaba, Sékou-Oumar and Bengio, Yoshua and Courville, Aaron and Precup, Doina and Lajoie, Guillaume},
  date = {2021},
  volume = {34},
  abstract = {We identify and formalize a fundamental gradient descent phenomenon leading to a learning proclivity in over-parameterized neural networks. Gradient Starvation arises when cross-entropy loss is minimized by capturing only a subset of features relevant for the task, despite the presence of other predictive features that fail to be discovered. This work provides a theoretical explanation for the emergence of such feature imbalances in neural networks. Using tools from Dynamical Systems theory, we identify simple properties of learning dynamics during gradient descent that lead to this imbalance, and prove that such a situation can be expected given certain statistical structure in training data. Based on our proposed formalism, we develop guarantees for a novel but simple regularization method aimed at decoupling feature learning dynamics, improving accuracy and robustness in cases hindered by gradient starvation. We illustrate our findings with simple and real-world out-of-distribution (OOD) generalization experiments.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/23NNJPC3/Pezeshki et al. - Gradient Starvation A Learning Proclivity in Neur.pdf}
}

@inproceedings{pfistererYAHPOGymEfficient,
  title = {{{YAHPO Gym}} - {{An Efficient Multi-Objective Multi-Fidelity Benchmark}} for {{Hyperparameter Optimization}}},
  booktitle = {Proceedings of the {{First International Conference}} on {{Automated Machine Learning}}},
  author = {Pfisterer, Florian and Schneider, Lennart and Moosbauer, Julia and Binder, Martin and Bischl, Bernd},
  date = {2022},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {188},
  publisher = {{PMLR}},
  abstract = {When developing and analyzing new hyperparameter optimization methods, it is vital to empirically evaluate and compare them on well-curated benchmark suites. In this work, we propose a new set of challenging and relevant benchmark problems motivated by desirable properties and requirements for such benchmarks. Our new surrogate-based benchmark collection consists of 14 scenarios that in total constitute over 700 multi-fidelity hyperparameter optimization problems, which all enable multi-objective hyperparameter optimization. Furthermore, we empirically compare surrogate-based benchmarks to the more widely-used tabular benchmarks, and demonstrate that the latter may produce unfaithful results regarding the performance ranking of HPO methods. We examine and compare our benchmark collection with respect to defined requirements and propose a single-objective as well as a multi-objective benchmark suite on which we compare 7 single-objective and 7 multi-objective optimizers in a benchmark experiment. Our software is available at [https://github.com/slds-lmu/yahpo\_gym].},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/8SKHLUHT/Pfisterer et al. - YAHPO Gym - An Efficient Multi-Objective Multi-Fid.pdf}
}

@unpublished{Piech2015,
  author = {Piech, Chris and Spencer, Jonathan and Huang, Jonathan and Ganguli, Surya and Sahami, Mehran and Guibas, Leonidas and Sohl-Dickstein, Jascha},
  title = {Deep {{Knowledge Tracing}}},
  date = {2015},
  eprinttype = {arxiv},
  eprint = {1506.05908},
  issn = {10495258},
  abstract = {Knowledge tracing---where a machine models the knowledge of a student as they interact with coursework---is a well established problem in computer supported education. Though effectively modeling student knowledge would have high educational impact, the task has many inherent challenges. In this paper we explore the utility of using Recurrent Neural Networks (RNNs) to model student learning. The RNN family of models have important advantages over previous methods in that they do not require the explicit encoding of human domain knowledge, and can capture more complex representations of student knowledge. Using neural networks results in substantial improvements in prediction performance on a range of knowledge tracing datasets. Moreover the learned model can be used for intelligent curriculum design and allows straightforward interpretation and discovery of structure in student tasks. These results suggest a promising new line of research for knowledge tracing and an exemplary application task for RNNs.},
  file = {/Users/ryedida/Zotero/storage/7HZ9J59S/Piech et al. - 2015 - Deep Knowledge Tracing(2).pdf}
}

@unpublished{Popowich2018,
  title = {Self-{{Attention}}: {{A Better Building Block}} for {{Sentiment Analysis Neural Network Classifiers}}},
  author = {Ambartsoumian, Artaches and Popowich, Fred},
  date = {2018},
  eprint = {1812.07860v1},
  eprinttype = {arxiv},
  file = {/Users/ryedida/Zotero/storage/D9UU6KMV/Popowich - 2018 - Self-Attention A Better Building Block for Sentiment Analysis Neural Network Classifiers(2).pdf}
}

@online{powerGrokkingGeneralizationOverfitting2022a,
  author = {Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
  title = {Grokking: {{Generalization Beyond Overfitting}} on {{Small Algorithmic Datasets}}},
  shorttitle = {Grokking},
  date = {2022-01-06},
  pubstate = {preprint},
  eprinttype = {arxiv},
  eprint = {2201.02177},
  eprintclass = {cs},
  url = {http://arxiv.org/abs/2201.02177},
  urldate = {2023-09-02},
  langid = {english},
  keywords = {Computer Science - Machine Learning},
  abstract = {In this paper we propose to study generalization of neural networks on small algorithmically generated datasets. In this setting, questions about data efficiency, memorization, generalization, and speed of learning can be studied in great detail. In some situations we show that neural networks learn through a process of “grokking” a pattern in the data, improving generalization performance from random chance level to perfect generalization, and that this improvement in generalization can happen well past the point of overfitting. We also study generalization as a function of dataset size and find that smaller datasets require increasing amounts of optimization for generalization. We argue that these datasets provide a fertile ground for studying a poorly understood aspect of deep learning: generalization of overparametrized neural networks beyond memorization of the finite training dataset.},
  file = {/Users/ryedida/Zotero/storage/RFAK83FU/Power et al. - 2022 - Grokking Generalization Beyond Overfitting on Sma.pdf}
}

@unpublished{prennerMakingMostSmall2021,
  author      = {Prenner, Julian Aron and Robbes, Romain},
  title       = {Making the Most of Small {{Software Engineering}} Datasets with Modern Machine Learning},
  date        = {2021-06-29},
  eprinttype  = {arxiv},
  eprintclass = {cs},
  eprint      = {2106.15209},
  url         = {http://arxiv.org/abs/2106.15209},
  urldate     = {2021-12-21},
  langid      = {english},
  keywords    = {Computer Science - Software Engineering},
  annotation  = {0 citations (Semantic Scholar/arXiv) [2021-12-21]},
  abstract    = {This paper provides a starting point for Software Engineering (SE) researchers and practitioners faced with the problem of training machine learning models on small datasets. Due to the high costs associated with labeling data, in Software Engineering, there exist many small ({$<$} 1 000 samples) and medium-sized ({$<$} 100 000 samples) datasets. While deep learning has set the state of the art in many machine learning tasks, it is only recently that it has proven effective on small-sized datasets, primarily thanks to pre-training, a semi-supervised learning technique that leverages abundant unlabelled data alongside scarce labelled data. In this work, we evaluate pre-trained Transformer models on a selection of 13 smaller datasets from the SE literature, covering both, source code and natural language. Our results suggest that pre-trained Transformers are competitive and in some cases superior to previous models, especially for tasks involving natural language; whereas for source code tasks, in particular for very small datasets, traditional machine learning methods often has the edge. In addition, we experiment with several techniques that ought to aid training on small datasets, including active learning, data augmentation, soft labels, self-training and intermediate-task fine-tuning, and issue recommendations on when they are effective. We also release all the data, scripts, and most importantly pre-trained models for the community to reuse on their own datasets.},
  file        = {/Users/ryedida/Zotero/storage/W567DCGY/Prenner and Robbes - 2021 - Making the most of small Software Engineering data.pdf}
}

@inproceedings{Pulina2010,
  title = {An Abstraction-Refinement Approach to Verification of Artificial Neural Networks},
  booktitle = {Computer {{Aided Verification}}},
  author = {Pulina, Luca and Tacchella, Armando},
  date = {2010},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {6174},
  pages = {243--257},
  publisher = {{Springer}},
  issn = {0302-9743},
  doi = {10.1007/978-3-642-14295-6_24},
  abstract = {A key problem in the adoption of artificial neural networks in safety-related applications is that misbehaviors can be hardly ruled out with traditional analytical or probabilistic techniques. In this paper we focus on specific networks known as Multi-Layer Perceptrons (MLPs), and we propose a solution to verify their safety using abstractions to Boolean combinations of linear arithmetic constraints. We show that our abstractions are consistent, i.e., whenever the abstract MLP is declared to be safe, the same holds for the concrete one. Spurious counterexamples, on the other hand, trigger refinements and can be leveraged to automate the correction of misbehaviors. We describe an implementation of our approach based on the HySAT solver, detailing the abstraction-refinement process and the automated correction strategy. Finally, we present experimental results confirming the feasibility of our approach on a realistic case study. © 2010 Springer-Verlag.},
  isbn = {364214294X},
  file = {/Users/ryedida/Zotero/storage/NGGC8K9B/Pulina, Tacchella - 2010 - An abstraction-refinement approach to verification of artificial neural networks(2).pdf}
}

@unpublished{qi2017pointnet++,
  title = {{{PointNet}}++: {{Deep}} Hierarchical Feature Learning on Point Sets in a Metric Space},
  author = {Qi, Charles R. and Yi, Li and Su, Hao and Guibas, Leonidas J.},
  date = {2017},
  eprint = {1706.02413},
  eprinttype = {arxiv}
}

@inproceedings{qianAreMyDeep,
  title = {Are {{My Deep Learning Systems Fair}}? {{An Empirical Study}} of {{Fixed-Seed Training}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Qian, Shangshu and Pham, Hung Viet and Lutellier, Thibaud and Hu, Zeou and Kim, Jungwon and Tan, Lin and Yu, Yaoliang and Chen, Jiahao and Shah, Sameena},
  date = {2021},
  volume = {34},
  pagetotal = {17},
  abstract = {Deep learning (DL) systems have been gaining popularity in critical tasks such as credit evaluation and crime prediction. Such systems demand fairness. Recent work shows that DL software implementations introduce variance: identical DL training runs (i.e., identical network, data, configuration, software, and hardware) with a fixed seed produce different models. Such variance could make DL models and networks violate fairness compliance laws, resulting in negative social impact. In this paper, we conduct the first empirical study to quantify the impact of software implementation on the fairness and its variance of DL systems. Our study of 22 mitigation techniques and five baselines reveals up to 12.6\% fairness variance across identical training runs with identical seeds. In addition, most debiasing algorithms have a negative impact on the model such as reducing model accuracy, increasing fairness variance, or increasing accuracy variance. Our literature survey shows that while fairness is gaining popularity in artificial intelligence (AI) related conferences, only 34.4\% of the papers use multiple identical training runs to evaluate their approach, raising concerns about their results’ validity. We call for better fairness evaluation and testing protocols to improve fairness and fairness variance of DL systems as well as DL research validity and reproducibility at large.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/5C5FYPRM/Qian et al. - Are My Deep Learning Systems Fair An Empirical St.pdf}
}

@article{quEvaluatingNetworkEmbedding2021,
  author       = {Qu, Yu and Yin, Heng},
  title        = {Evaluating Network Embedding Techniques’ Performances in Software Bug Prediction},
  journaltitle = {Empirical Software Engineering},
  shortjournal = {Empir Software Eng},
  date         = {2021-07},
  volume       = {26},
  number       = {4},
  pages        = {60},
  issn         = {1382-3256, 1573-7616},
  doi          = {10.1007/s10664-021-09965-5},
  url          = {https://link.springer.com/10.1007/s10664-021-09965-5},
  urldate      = {2021-05-17},
  langid       = {english},
  annotation   = {0 citations (Semantic Scholar/DOI) [2021-05-17]},
  abstract     = {Software bug prediction techniques can be very helpful in testing and code inspection. Over the past decade, network measures have been successfully used in bug prediction. Following the same intuition, recently, researchers started using network embedding techniques in bug prediction. However, existing studies only evaluated the Skip-gram and CBOW models with random walk. Considering network embedding is a fast-developing research direction, it is important to evaluate other network embedding techniques’ performances in bug prediction. Moreover, existing studies have not investigated the application and performance of network embedding in effort-aware bug prediction, which is thought to be a more realistic scenario that evaluates the cost effectiveness of bug prediction models. In this paper, we conduct an extensive empirical study to evaluate network embedding algorithms in bug prediction by utilizing and extending node2defect, a newly proposed bug prediction model that combines the embedded vectors with traditional software engineering metrics through concatenation. Experiments are conducted based on seven network embedding algorithms, two effort-aware models, and 13 open-source Java systems. Experimental results show that node2defect outperforms traditional metrics by + 14.64\% in terms of MCC score, and by + 7.51\% to + 16.57\% in effort-aware bug prediction. More interestingly, when combined with CBS+, the embedded vectors alone can achieve the best performance. Among different network embedding algorithms, the newly proposed algorithm ProNE has the best performance.},
  file         = {/Users/ryedida/Zotero/storage/E43SW5U4/Qu and Yin - 2021 - Evaluating network embedding techniques’ performan.pdf}
}

@article{Quinlan1986,
  title = {Induction of {{Decision Trees}}},
  author = {Quinlan, J. Ross},
  date = {1986},
  journaltitle = {Machine Learning},
  volume = {1},
  number = {1},
  pages = {81--106},
  issn = {1573-0565},
  doi = {10.1023/A:1022643204877},
  keywords = {classification,decision trees,expert,induction,information theory,knowledge acquisition}
}

@inproceedings{quNode2defectUsingNetwork2018,
  title = {Node2defect: Using Network Embedding to Improve Software Defect Prediction},
  shorttitle = {Node2defect},
  booktitle = {Proceedings of the 33rd {{ACM}}/{{IEEE International Conference}} on {{Automated Software Engineering}}},
  author = {Qu, Yu and Liu, Ting and Chi, Jianlei and Jin, Yangxu and Cui, Di and He, Ancheng and Zheng, Qinghua},
  date = {2018-09-03},
  pages = {844--849},
  publisher = {{ACM}},
  location = {{Montpellier, France}},
  doi = {10.1145/3238147.3240469},
  url = {https://dl.acm.org/doi/10.1145/3238147.3240469},
  urldate = {2021-05-17},
  abstract = {Network measures have been proved to be useful in predicting software defects. Leveraging the dependency relationships between software modules, network measures can capture various structural features of software systems. However, existing studies have relied on user-defined network measures (e.g., degree statistics or centrality metrics), which are inflexible and require high computation cost, to describe the structural features. In this paper, we propose a new method called node2defect which uses a newly proposed network embedding technique, node2vec, to automatically learn to encode dependency network structure into low-dimensional vector spaces to improve software defect prediction. Specifically, we firstly construct a program’s Class Dependency Network. Then node2vec is used to automatically learn structural features of the network. After that, we combine the learned features with traditional software engineering features, for accurate defect prediction. We evaluate our method on 15 open source programs. The experimental results show that in average, node2defect improves the state-of-the-art approach by 9.15\% in terms of F-measure.},
  eventtitle = {{{ASE}} '18: 33rd {{ACM}}/{{IEEE International Conference}} on {{Automated Software Engineering}}},
  isbn = {978-1-4503-5937-5},
  langid = {english},
  annotation = {1 citations (Semantic Scholar/DOI) [2021-05-17]},
  file = {/Users/ryedida/Zotero/storage/SXR3KEGS/Qu et al. - 2018 - node2defect using network embedding to improve so.pdf}
}

@report{Radrof2019,
  title = {Language {{Models}} Are {{Unsupervised Multitask Learners}}},
  author = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  date = {2019},
  type = {techreport},
  institution = {{OpenAI}},
  url = {https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf},
  abstract = {Natural language processing tasks, such as question answering, machine translation, reading comprehension , and summarization, are typically approached with supervised learning on task-specific datasets. We demonstrate that language models begin to learn these tasks without any explicit supervision when trained on a new dataset of millions of webpages called WebText. When conditioned on a document plus questions, the answers generated by the language model reach 55 F1 on the CoQA dataset-matching or exceeding the performance of 3 out of 4 baseline systems without using the 127,000+ training examples. The capacity of the language model is essential to the success of zero-shot task transfer and increasing it improves performance in a log-linear fashion across tasks. Our largest model, GPT-2, is a 1.5B parameter Transformer that achieves state of the art results on 7 out of 8 tested language modeling datasets in a zero-shot setting but still underfits WebText. Samples from the model reflect these improvements and contain coherent paragraphs of text. These findings suggest a promising path towards building language processing systems which learn to perform tasks from their naturally occurring demonstrations.},
  file = {/Users/ryedida/Zotero/storage/TY83GFXA/Radrof et al. - 2019 - Language Models are Unsupervised Multitask Learners(2).pdf}
}

@inproceedings{Raghunathan2018,
  title = {Certified Defenses against Adversarial Examples},
  booktitle = {6th {{International Conference}} on {{Learning Representations}} ({{ICLR}})},
  author = {Raghunathan, Aditi and Steinhardt, Jacob and Liang, Percy},
  date = {2018},
  eprint = {1801.09344},
  eprinttype = {arxiv},
  pagetotal = {15},
  abstract = {While neural networks have achieved high accuracy on standard image classification benchmarks, their accuracy drops to nearly zero in the presence of small adversarial perturbations to test inputs. Defenses based on regularization and adversarial training have been proposed, but often followed by new, stronger attacks that defeat these defenses. Can we somehow end this arms race? In this work, we study this problem for neural networks with one hidden layer. We first propose a method based on a semidefinite relaxation that outputs a certificate that for a given network and test input, no attack can force the error to exceed a certain value. Second, as this certificate is differentiable, we jointly optimize it with the network parameters, providing an adaptive regularizer that encourages robustness against all attacks. On MNIST, our approach produces a network and a certificate that no attack that perturbs each pixel by at most {$\epsilon = 0.1$} can cause more than 35\% test error.},
  file = {/Users/ryedida/Zotero/storage/BEPN83CS/Raghunathan, Steinhardt, Liang - 2018 - Certified defenses against adversarial examples(2).pdf}
}

@inproceedings{rahman2014comparing,
  title = {Comparing Static Bug Finders and Statistical Prediction},
  booktitle = {Proceedings of the 36th {{International Conference}} on {{Software Engineering}}},
  author = {Rahman, Foyzur and Khatri, Sameer and Barr, Earl T. and Devanbu, Premkumar},
  date = {2014},
  series = {{{ICSE}} '14},
  pages = {424--434},
  publisher = {{ACM}},
  doi = {10.1145/2568225.2568269}
}

@inproceedings{rahul16fse,
  title = {The ``{{BigSE}}'' Project: {{Lessons}} Learned from Validating Industrial Text Mining},
  booktitle = {Proceedings of the 2nd International Workshop on {{BIG}} Data Software Engineering},
  author = {Krishna, Rahul and Yu, Zhe and Agrawal, Amritanshu and Dominguez, Manuel and Wolf, David},
  date = {2016},
  series = {{{BIGDSE}} '16},
  pages = {65--71},
  publisher = {{Association for Computing Machinery}},
  location = {{New York, NY, USA}},
  doi = {10.1145/2896825.2896836},
  url = {https://doi.org/10.1145/2896825.2896836},
  abstract = {As businesses become increasingly reliant on big data analytics, it becomes increasingly important to test the choices made within the data miners. This paper reports lessons learned from the BigSE Lab, an industrial/university collaboration that augments industrial activity with low-cost testing of data miners (by graduate students). BigSE is an experiment in academic/industrial collaboration. Funded by a gift from LexisNexis, BigSE has no specific deliverables. Rather, it is fueled by a research question ``what can industry and academia learn from each other?''. Based on open source data and tools, the output of this work is (a) more exposure by commercial engineers to state-of-the-art methods and (b) more exposure by students to industrial text mining methods (plus research papers that comment on methods on how to improve those methods). The results so far are encouraging. Students at BigSE Lab have found numerous ``standard'' choices for text mining that could be replaced by simpler and less resource intensive methods. Further, that work also found additional text mining choices that could significantly improve the performance of industrial data miners.},
  isbn = {978-1-4503-4152-3},
  pagetotal = {7},
  keywords = {e-discovery,software engineering,testing}
}

@unpublished{Rajadesingan2019,
  author     = {Rajadesingan, Ashwin and Mahalingam, Ramaswami and Jurgens, David},
  title      = {Smart, {{Responsible}}, and {{Upper Caste Only}}: {{Measuring Caste Attitudes}} through {{Large-Scale Analysis}} of {{Matrimonial Profiles}}},
  date       = {2019},
  eprinttype = {arxiv},
  eprint     = {1904.04176},
  url        = {http://arxiv.org/abs/1904.04176},
  abstract   = {Discriminatory caste attitudes currently stigmatize millions of Indians, subjecting individuals to prejudice in all aspects of life. Governmental incentives and societal movements have attempted to counter these attitudes, yet accurate measurements of public opinions on caste are not yet available for understanding whether progress is being made. Here, we introduce a novel approach to measure public attitudes of caste through an indicator variable: openness to intercaste marriage. Using a massive dataset of over 313K profiles from a major Indian matrimonial site, we precisely quantify public attitudes, along with differences between generations and between Indian residents and diaspora. We show that younger generations are more open to intercaste marriage, yet attitudes are based on a complex function of social status beyond their own caste. In examining the desired qualities in a spouse, we find that individuals open to intercaste marriage are more individualistic in the qualities they desire, rather than favoring family-related qualities, which mirrors larger societal trends away from collectivism. Finally, we show that attitudes in diaspora are significantly less open, suggesting a bi-cultural model of integration. Our research provides the first empirical evidence identifying how various intersections of identity shape attitudes toward intercaste marriage in India and among the Indian diaspora in the US.},
  file       = {/Users/ryedida/Zotero/storage/3FEP8KS7/Rajadesingan, Mahalingam, Jurgens - 2019 - Smart, Responsible, and Upper Caste Only Measuring Caste Attitudes through Large-Scale Ana(2).pdf}
}

@article{ramanFundusPhotographbasedDeep,
  title = {Fundus Photograph-Based Deep Learning Algorithms in Detecting Diabetic Retinopathy},
  author = {Raman, Rajiv and Srinivasan, Sangeetha and Virmani, Sunny and Sivaprasad, Sobha and Rao, Chetan and Rajalakshmi, Ramachandran},
  date = {2019},
  journaltitle = {Eye},
  volume = {33},
  number = {1},
  pages = {97--109},
  doi = {10.1038/s41433-018-0269-y},
  pagetotal = {13},
  abstract = {Remarkable advances in biomedical research have led to the generation of large amounts of data. Using artificial intelligence, it has become possible to extract meaningful information from large volumes of data, in a shorter frame of time, with very less human interference. In effect, convolutional neural networks (a deep learning method) have been taught to recognize pathological lesions from images. Diabetes has high morbidity, with millions of people who need to be screened for diabetic retinopathy (DR). Deep neural networks offer a great advantage of screening for DR from retinal images, in improved identification of DR lesions and risk factors for diseases, with high accuracy and reliability. This review aims to compare the current evidences on various deep learning models for diagnosis of diabetic retinopathy (DR).},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/YREYV764/Raman - Fundus photograph-based deep learning algorithms i.pdf}
}

@online{ranjanASAPAdaptiveStructure,
  title = {{{ASAP}}: {{Adaptive Structure Aware Pooling}} for {{Learning Hierarchical Graph Representations}}},
  author = {Ranjan, Ekagra and Sanyal, Soumya and Talukdar, Partha},
  date = {2019-11},
  eprint = {1911.07979},
  eprinttype = {arxiv},
  url = {https://arxiv.org/abs/1911.07979},
  urldate = {2021-03-04},
  abstract = {Graph Neural Networks (GNN) have been shown to work effectively for modeling graph structured data to solve tasks such as node classification, link prediction and graph classification. There has been some recent progress in defining the notion of pooling in graphs whereby the model tries to generate a graph level representation by downsampling and summarizing the information present in the nodes. Existing pooling methods either fail to effectively capture the graph sub-structure or do not easily scale to large graphs. In this work, we propose ASAP (Adaptive Structure Aware Pooling), a sparse and differentiable pooling method that addresses the limitations of previous graph pooling architectures. ASAP utilizes a novel self-attention network along with a modified GNN formulation to capture the importance of each node in a given graph. It also learns a sparse soft cluster assignment for nodes at each layer to effectively pool the subgraphs to form the pooled graph. Through extensive experiments on multiple datasets and theoretical analysis, we motivate our choice of the components used in ASAP. Our experimental results show that combining existing GNN architectures with ASAP leads to state-of-the-art results on multiple graph classification benchmarks. ASAP has an average improvement of 4\%, compared to current sparse hierarchical state-of-the-art method.},
  pubstate = {preprint}
}

@inproceedings{real2019regularized,
  title = {Regularized Evolution for Image Classifier Architecture Search},
  booktitle = {Proceedings of the {{AAAI}} Conference on Artificial Intelligence},
  author = {Real, Esteban and Aggarwal, Alok and Huang, Yanping and Le, Quoc V.},
  date = {2019},
  volume = {33},
  number = {01},
  pages = {4780--4789},
  doi = {10.1609/aaai.v33i01.33014780}
}

@inproceedings{reddiCONVERGENCEADAM2018,
  title = {On the {{Convergence}} of {{Adam}} and {{Beyond}}},
  booktitle = {6th {{International Conference}} on {{Learning Representations}} ({{ICLR}})},
  author = {Reddi, Sashank J. and Kale, Satyen and Kumar, Sanjiv},
  date = {2018},
  url = {https://openreview.net/forum?id=ryQu7f-RZ},
  abstract = {Several recently proposed stochastic optimization methods that have been successfully used in training deep networks such as RMSPROP, ADAM, ADADELTA, NADAM are based on using gradient updates scaled by square roots of exponential moving averages of squared past gradients. In many applications, e.g. learning with large output spaces, it has been empirically observed that these algorithms fail to converge to an optimal solution (or a critical point in nonconvex settings). We show that one cause for such failures is the exponential moving average used in the algorithms. We provide an explicit example of a simple convex optimization setting where ADAM does not converge to the optimal solution, and describe the precise problems with the previous analysis of ADAM algorithm. Our analysis suggests that the convergence issues can be fixed by endowing such algorithms with “long-term memory” of past gradients, and propose new variants of the ADAM algorithm which not only fix the convergence issues but often also lead to improved empirical performance.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/ISQTEWGW/Reddi et al. - 2018 - ON THE CONVERGENCE OF ADAM AND BEYOND.pdf}
}

@inproceedings{renLearningReweightExamples,
  title = {Learning to {{Reweight Examples}} for {{Robust Deep Learning}}},
  booktitle = {Proceedings of the 35th {{International Conference}} on {{Machine Learning}}},
  author = {Ren, Mengye and Zeng, Wenyuan and Yang, Bin and Urtasun, Raquel},
  date = {2018},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {80},
  pages = {4334--4343},
  publisher = {{PMLR}},
  pagetotal = {10},
  abstract = {Deep neural networks have been shown to be very powerful modeling tools for many supervised learning tasks involving complex input patterns. However, they can also easily overfit to training set biases and label noises. In addition to various regularizers, example reweighting algorithms are popular solutions to these problems, but they require careful tuning of additional hyperparameters, such as example mining schedules and regularization hyperparameters. In contrast to past reweighting methods, which typically consist of functions of the cost value of each example, in this work we propose a novel meta-learning algorithm that learns to assign weights to training examples based on their gradient directions. To determine the example weights, our method performs a meta gradient descent step on the current mini-batch example weights (which are initialized from zero) to minimize the loss on a clean unbiased validation set. Our proposed method can be easily implemented on any type of deep network, does not require any additional hyperparameter tuning, and achieves impressive performance on class imbalance and corrupted label problems where only a small amount of clean validation data is available.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/8FHNX5F3/Ren et al. - Learning to Reweight Examples for Robust Deep Lear.pdf}
}

@inproceedings{Ribeiro2016,
  title = {``{{Why Should I Trust You}}?'': {{Explaining}} the {{Predictions}} of {{Any Classifier}}},
  booktitle = {Proceedings of the 22nd {{ACM SIGKDD International Conference}} on {{Knowledge Discovery}} and {{Data Mining}}},
  author = {Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos},
  date = {2016},
  eprint = {1602.04938},
  eprinttype = {arxiv},
  pages = {1135--1144},
  publisher = {{ACM}},
  doi = {10.1145/2939672.2939778},
  abstract = {Despite widespread adoption, machine learning models remain mostly black boxes. Understanding the reasons behind predictions is, however, quite important in assessing trust, which is fundamental if one plans to take action based on a prediction, or when choosing whether to deploy a new model. Such understanding also provides insights into the model, which can be used to transform an untrustworthy model or prediction into a trustworthy one. In this work, we propose LIME, a novel explanation technique that explains the predictions of any classifier in an interpretable and faithful manner, by learning an interpretable model locally around the prediction. We also propose a method to explain models by presenting representative individual predictions and their explanations in a non-redundant way, framing the task as a submodular optimization problem. We demonstrate the flexibility of these methods by explaining different models for text (e.g. random forests) and image classification (e.g. neural networks). We show the utility of explanations via novel experiments, both simulated and with human subjects, on various scenarios that require trust: deciding if one should trust a prediction, choosing between models, improving an untrustworthy classifier, and identifying why a classifier should not be trusted.},
  isbn = {978-1-4503-4232-2}
}

@inproceedings{riceOverfittingAdversariallyRobust,
  title = {Overfitting in Adversarially Robust Deep Learning},
  booktitle = {Proceedings of the 37th {{International Conference}} on {{Machine Learning}}},
  author = {Rice, Leslie and Wong, Eric and Kolter, J. Zico},
  date = {2020},
  series = {Proceedings of {{Machine Learning Research}}},
  volume = {119},
  pages = {8093--8104},
  publisher = {{PMLR}},
  pagetotal = {12},
  abstract = {It is common practice in deep learning to use overparameterized networks and train for as long as possible; there are numerous studies that show, both theoretically and empirically, that such practices surprisingly do not unduly harm the generalization performance of the classifier. In this paper, we empirically study this phenomenon in the setting of adversarially trained deep networks, which are trained to minimize the loss under worst-case adversarial perturbations. We find that overfitting to the training set does in fact harm robust performance to a very large degree in adversarially robust training across multiple datasets (SVHN, CIFAR-10, CIFAR-100, and ImageNet) and perturbation models ({$\ell_\infty$} and {$\ell_2$}). Based upon this observed effect, we show that the performance gains of virtually all recent algorithmic improvements upon adversarial training can be matched by simply using early stopping. We also show that effects such as the double descent curve do still occur in adversarially trained models, yet fail to explain the observed overfitting. Finally, we study several classical and modern deep learning remedies for overfitting, including regularization and data augmentation, and find that no approach in isolation improves significantly upon the gains achieved by early stopping. All code for reproducing the experiments as well as pretrained model weights and training logs can be found at https://github.com/locuslab/robust\_overfitting.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/4N2K5XRP/Rice et al. - Overfitting in adversarially robust deep learning.pdf}
}

@article{rissanenUniversalPriorIntegers1983,
  title = {A Universal Prior for Integers and Estimation by Minimum Description Length},
  author = {Rissanen, Jorma},
  date = {1983},
  journaltitle = {The Annals of Statistics},
  volume = {11},
  number = {2},
  pages = {416--431},
  publisher = {{Institute of Mathematical Statistics}},
  issn = {0090-5364},
  doi = {10.1214/aos/1176346150},
  file = {/Users/ryedida/Zotero/storage/DYVIUH6D/Rissanen_1983_A universal prior for integers and estimation by minimum description length.pdf}
}

@inproceedings{Ronneberger2015,
  title = {U-{{Net}}: {{Convolutional}} Networks for Biomedical Image Segmentation},
  booktitle = {Medical {{Image Computing}} and {{Computer-Assisted Intervention}} -- {{MICCAI}} 2015},
  author = {Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas},
  date = {2015},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {9351},
  eprint = {1505.04597},
  eprinttype = {arxiv},
  pages = {234--241},
  publisher = {{Springer}},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-24574-4_28},
  abstract = {There is large consent that successful training of deep networks requires many thousand annotated training samples. In this paper, we present a network and training strategy that relies on the strong use of data augmentation to use the available annotated samples more efficiently. The architecture consists of a contracting path to capture context and a symmetric expanding path that enables precise localization. We show that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks. Using the same network trained on transmitted light microscopy images (phase contrast and DIC) we won the ISBI cell tracking challenge 2015 in these categories by a large margin. Moreover, the network is fast. Segmentation of a 512x512 image takes less than a second on a recent GPU. The full implementation (based on Caffe) and the trained networks are available at http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net .},
  isbn = {978-3-319-24573-7},
  file = {/Users/ryedida/Zotero/storage/VL28DD4M/Ronneberger, Fischer, Brox - 2015 - U-net Convolutional networks for biomedical image segmentation(2).pdf}
}

@incollection{rosenthal1994parametric,
  title = {Parametric Measures of Effect Size},
  booktitle = {The {{Handbook}} of {{Research Synthesis}}},
  author = {Rosenthal, Robert},
  editor = {Cooper, Harris and Hedges, Larry V.},
  date = {1994},
  pages = {231--244},
  publisher = {{Russell Sage Foundation}},
  location = {{New York}}
}

@unpublished{Ross2018,
  title = {Learning {{Qualitatively Diverse}} and {{Interpretable Rules}} for {{Classification}}},
  author = {Ross, Andrew Slavin and Pan, Weiwei and Doshi-Velez, Finale},
  date = {2018},
  eprint = {1806.08716},
  eprinttype = {arxiv},
  pages = {87--94},
  url = {http://arxiv.org/abs/1806.08716},
  abstract = {There has been growing interest in developing accurate models that can also be explained to humans. Unfortunately, if there exist multiple distinct but accurate models for some dataset, current machine learning methods are unlikely to find them: standard techniques will likely recover a complex model that combines them. In this work, we introduce a way to identify a maximal set of distinct but accurate models for a dataset. We demonstrate empirically that, in situations where the data supports multiple accurate classifiers, we tend to recover simpler, more interpretable classifiers rather than more complex ones.}
}

@unpublished{rothOddsAreOdd2019,
  author      = {Roth, Kevin and Kilcher, Yannic and Hofmann, Thomas},
  title       = {The {{Odds}} Are {{Odd}}: {{A Statistical Test}} for {{Detecting Adversarial Examples}}},
  shorttitle  = {The {{Odds}} Are {{Odd}}},
  date        = {2019-05-09},
  eprinttype  = {arxiv},
  eprintclass = {cs, stat},
  eprint      = {1902.04818},
  url         = {http://arxiv.org/abs/1902.04818},
  urldate     = {2021-04-19},
  langid      = {english},
  keywords    = {Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation  = {52 citations (Semantic Scholar/arXiv) [2021-04-18]},
  abstract    = {We investigate conditions under which test statistics exist that can reliably detect examples, which have been adversarially manipulated in a whitebox attack. These statistics can be easily computed and calibrated by randomly corrupting inputs. They exploit certain anomalies that adversarial attacks introduce, in particular if they follow the paradigm of choosing perturbations optimally under p-norm constraints. Access to the log-odds is the only requirement to defend models. We justify our approach empirically, but also provide conditions under which detectability via the suggested test statistics is guaranteed to be effective. In our experiments, we show that it is even possible to correct test time predictions for adversarial attacks with high accuracy.},
  file        = {/Users/ryedida/Zotero/storage/5AWJST65/Roth et al. - 2019 - The Odds are Odd A Statistical Test for Detecting.pdf}
}

@unpublished{Rotich2018,
  title = {Resource-{{Constrained Simultaneous Detection}} and {{Labeling}} of {{Objects}} in {{High-Resolution Satellite Images}}},
  author = {Rotich, Gilbert and Minetto, Rodrigo and Sarkar, Sudeep},
  date = {2018},
  eprint = {1810.10110},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1810.10110},
  abstract = {We describe a strategy for detection and classification of man-made objects in large high-resolution satellite photos under computational resource constraints. We detect and classify candidate objects by using five pipelines of convolutional neural network processing (CNN), run in parallel. Each pipeline has its own unique strategy for fine tunning parameters, proposal region filtering, and dealing with image scales. The conflicting region proposals are merged based on region confidence and not just based on overlap areas, which improves the quality of the final bounding-box regions selected. We demonstrate this strategy using the recent xView challenge, which is a complex benchmark with more than 1,100 high-resolution images, spanning 800,000 aerial objects around the world covering a total area of 1,400 square kilometers at 0.3 meter ground sample distance. To tackle the resource-constrained problem posed by the xView challenge, where inferences are restricted to be on CPU with 8GB memory limit, we used lightweight CNN's trained with the single shot detector algorithm. Our approach was competitive on sequestered sets; it was ranked third.},
  file = {/Users/ryedida/Zotero/storage/59BPW94W/Rotich, Minetto, Sarkar - 2018 - Resource-Constrained Simultaneous Detection and Labeling of Objects in High-Resolution Satellite Ima(2).pdf}
}

@inproceedings{Rozsa2017,
  title = {Are {{Accuracy}} and {{Robustness Correlated}}},
  booktitle = {2016 15th {{IEEE International Conference}} on {{Machine Learning}} and {{Applications}} ({{ICMLA}})},
  author = {Rozsa, Andras and G{\"u}nther, Manuel and Boult, Terrance E.},
  date = {2017},
  eprint = {1610.04563},
  eprinttype = {arxiv},
  pages = {227--232},
  doi = {10.1109/icmla.2016.0045},
  abstract = {Machine learning models are vulnerable to adversarial examples formed by applying small carefully chosen perturbations to inputs that cause unexpected classification errors. In this paper, we perform experiments on various adversarial example generation approaches with multiple deep convolutional neural networks including Residual Networks, the best performing models on ImageNet Large-Scale Visual Recognition Challenge 2015. We compare the adversarial example generation techniques with respect to the quality of the produced images, and measure the robustness of the tested machine learning models to adversarial examples. Finally, we conduct large-scale experiments on cross-model adversarial portability. We find that adversarial examples are mostly transferable across similar network topologies, and we demonstrate that better machine learning models are less vulnerable to adversarial examples.},
  isbn = {9781509061679},
  file = {/Users/ryedida/Zotero/storage/XA79ZXL2/Rozsa, Gunther, Boult - 2017 - Are Accuracy and Robustness Correlated(2).pdf}
}

@book{rudinPrinciplesMathematicalAnalysis1953,
  title = {Principles of Mathematical Analysis},
  author = {Rudin, Walter},
  date = {1953},
  publisher = {{McGraw-Hill}},
  location = {{New York}},
  url = {https://catalog.lib.ncsu.edu/catalog/NCSU189792},
  langid = {english},
  keywords = {Calculus,Functions},
  file = {/Users/ryedida/Zotero/storage/234J24MP/Baby Rudin Guide.pdf;/Users/ryedida/Zotero/storage/6QPB7KF7/Rudin_1953_Principles of mathematical analysis.pdf;/Users/ryedida/Zotero/storage/F8PXQQXT/Baby Rudin Solutions.pdf}
}

@book{rudinRealComplexAnalysis1974,
  title = {Real and Complex Analysis},
  author = {Rudin, Walter},
  date = {1974},
  publisher = {{McGraw-Hill}},
  location = {{New York}},
  url = {https://catalog.lib.ncsu.edu/catalog/NCSU196187},
  langid = {english},
  keywords = {Mathematical analysis},
  file = {/Users/ryedida/Zotero/storage/5SR86HLS/Rudin_1974_Real and complex analysis.pdf}
}

@article{Saeed2018,
  title = {{{ASQFor}}: {{Automatic SPARQL}} Query Formulation for the Non-Expert},
  author = {Saeed, Muhammad Rizwan and Chelmis, Charalampos and Prasanna, Viktor K.},
  date = {2018},
  journaltitle = {AI Communications},
  volume = {31},
  number = {1},
  pages = {19--32},
  issn = {0921-7126},
  doi = {10.3233/AIC-170746},
  abstract = {The combination of data, semantics, and the Web has led to an ever growing and increasingly complex body of semantic data. Accessing such structured data requires learning formal query languages, such as SPARQL, which poses significant difficulties for non-expert users. To date, many interfaces for querying Ontologies have been developed. However, such interfaces rely on predefined templates or require expensive pre-processing and customization. Natural Language (NL) interfaces are particularly preferable to other interfaces for providing users with access to data. However the inherent difficulty in mapping NL queries to semantic data can create ambiguities during query formulation phase. To avoid the pitfalls of existing approaches, while at the same time retaining the ability to capture users' complex information needs, we propose a simple keyword-based search interface to the Semantic Web. Specifically, we propose Automatic SPARQL Query Formulation (ASQFor), a systematic framework to issue semantic queries over RDF repositories using simple concept-based search primitives. ASQFor has a very simple interface, requires no user training, and can be easily embedded in any system or used with any semantic repository without prior customization. We demonstrate via extensive experimentation that ASQFor significantly speeds up query formulation while at the same time matching the syntax of hand-crafted queries.},
  keywords = {Automatic query formulation,Ontologies,RDF,Semantic web,SPARQL},
  file = {/Users/ryedida/Zotero/storage/CB6ZU5A9/Saeed, Chelmis, Prasanna - 2018 - ASQFor Automatic SPARQL query formulation for the non-expert(2).pdf}
}

@inproceedings{Salimans2016,
  title = {Weight {{Normalization}}: {{A Simple Reparameterization}} to {{Accelerate Training}} of {{Deep Neural Networks}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Salimans, Tim and Kingma, Diederik P.},
  date = {2016},
  volume = {29},
  eprint = {1602.07868},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1602.07868},
  abstract = {We present weight normalization: a reparameterization of the weight vectors in a neural network that decouples the length of those weight vectors from their direction. By reparameterizing the weights in this way we improve the conditioning of the optimization problem and we speed up convergence of stochastic gradient descent. Our reparameterization is inspired by batch normalization but does not introduce any dependencies between the examples in a minibatch. This means that our method can also be applied successfully to recurrent models such as LSTMs and to noise-sensitive applications such as deep reinforcement learning or generative models, for which batch normalization is less well suited. Although our method is much simpler, it still provides much of the speed-up of full batch normalization. In addition, the computational overhead of our method is lower, permitting more optimization steps to be taken in the same amount of time. We demonstrate the usefulness of our method on applications in supervised image recognition, generative modelling, and deep reinforcement learning.},
  file = {/Users/ryedida/Zotero/storage/S6PSJ7KC/Salimans, Kingma - 2016 - Weight Normalization A Simple Reparameterization to Accelerate Training of Deep Neural Networks(2).pdf}
}

@inproceedings{Samangouei2018,
  title = {Defense-{{GAN}}: {{Protecting}} Classifiers against Adversarial Attacks Using Generative Models},
  booktitle = {6th {{International Conference}} on {{Learning Representations}}, {{ICLR}} 2018 - {{Conference Track Proceedings}}},
  author = {Samangouei, Pouya and Kabkab, Maya and Chellappa, Rama},
  date = {2018},
  eprint = {1805.06605},
  eprinttype = {arxiv},
  abstract = {In recent years, deep neural network approaches have been widely adopted for machine learning tasks, including classification. However, they were shown to be vulnerable to adversarial perturbations: carefully crafted small perturbations can cause misclassification of legitimate images. We propose Defense-GAN, a new framework leveraging the expressive capability of generative models to defend deep neural networks against such attacks. Defense-GAN is trained to model the distribution of unperturbed images. At inference time, it finds a close output to a given image which does not contain the adversarial changes. This output is then fed to the classifier. Our proposed method can be used with any classification model and does not modify the classifier structure or training procedure. It can also be used as a defense against any attack as it does not assume knowledge of the process for generating the adversarial examples. We empirically show that Defense-GAN is consistently effective against different attack methods and improves on existing defense strategies.},
  file = {/Users/ryedida/Zotero/storage/KNKCBU33/Samangouei, Kabkab, Chellappa - 2018 - Defense-Gan Protecting classifiers against adversarial attacks using generative models(2).pdf}
}

@inproceedings{santurkar2018does,
  title = {How Does Batch Normalization Help Optimization?},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Santurkar, Shibani and Tsipras, Dimitris and Ilyas, Andrew and Madry, Aleksander},
  date = {2018},
  volume = {31},
  file = {/Users/ryedida/Zotero/storage/F3I2DANR/Santurkar et al_2018_How does batch normalization help optimization.pdf}
}

@online{saxeExactSolutionsNonlinear2014,
  author = {Saxe, Andrew M. and McClelland, James L. and Ganguli, Surya},
  title = {Exact Solutions to the Nonlinear Dynamics of Learning in Deep Linear Neural Networks},
  date = {2014-02-19},
  pubstate = {preprint},
  eprinttype = {arxiv},
  eprint = {1312.6120},
  eprintclass = {cond-mat, q-bio, stat},
  url = {http://arxiv.org/abs/1312.6120},
  urldate = {2023-11-26},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Condensed Matter - Disordered Systems and Neural Networks,Quantitative Biology - Neurons and Cognition,Statistics - Machine Learning},
  abstract = {Despite the widespread practical success of deep learning methods, our theoretical understanding of the dynamics of learning in deep neural networks remains quite sparse. We attempt to bridge the gap between the theory and practice of deep learning by systematically analyzing learning dynamics for the restricted case of deep linear neural networks. Despite the linearity of their input-output map, such networks have nonlinear gradient descent dynamics on weights that change with the addition of each new hidden layer. We show that deep linear networks exhibit nonlinear learning phenomena similar to those seen in simulations of nonlinear networks, including long plateaus followed by rapid transitions to lower error solutions, and faster convergence from greedy unsupervised pretraining initial conditions than from random initial conditions. We provide an analytical description of these phenomena by finding new exact solutions to the nonlinear dynamics of deep learning. Our theoretical analysis also reveals the surprising finding that as the depth of a network approaches infinity, learning speed can nevertheless remain finite: for a special class of initial conditions on the weights, very deep networks incur only a finite, depth independent, delay in learning speed relative to shallow networks. We show that, under certain conditions on the training data, unsupervised pretraining can find this special class of initial conditions, while scaled random Gaussian initializations cannot. We further exhibit a new class of random orthogonal initial conditions on weights that, like unsupervised pre-training, enjoys depth independent learning times. We further show that these initial conditions also lead to faithful propagation of gradients even in deep nonlinear networks, as long as they operate in a special regime known as the edge of chaos.},
  file = {/Users/ryedida/Zotero/storage/M2V9YB7P/Saxe et al. - 2014 - Exact solutions to the nonlinear dynamics of learn.pdf}
}

@inproceedings{saxena2016convolutional,
  author = {Saxena, Shreyas and Verbeek, Jakob},
  title = {Convolutional Neural Fabrics},
  date = {2016},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  pages = {4053--4061}
}

@misc{Sayyad-Shirabad+Menzies:2005,
  title = {The {{PROMISE Repository}} of {{Software Engineering Databases}}},
  author = {Sayyad Shirabad, J and Menzies, T J},
  date = {2005},
  url = {http://promise.site.uottawa.ca/SERepository}
}

@inproceedings{Schmidt2018,
  title = {Adversarially Robust Generalization Requires More Data},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Schmidt, Ludwig and Talwar, Kunal and Santurkar, Shibani and Tsipras, Dimitris and Madry, Aleksander},
  date = {2018},
  volume = {31},
  eprint = {1804.11285},
  eprinttype = {arxiv},
  pages = {5014--5026},
  issn = {1049-5258},
  abstract = {Machine learning models are often susceptible to adversarial perturbations of their inputs. Even small perturbations can cause state-of-the-art classifiers with high “standard” accuracy to produce an incorrect prediction with high confidence. To better understand this phenomenon, we study adversarially robust learning from the viewpoint of generalization. We show that already in a simple natural data model, the sample complexity of robust learning can be significantly larger than that of “standard” learning. This gap is information theoretic and holds irrespective of the training algorithm or the model family. We complement our theoretical results with experiments on popular image classification datasets and show that a similar gap exists here as well. We postulate that the difficulty of training robust classifiers stems, at least partially, from this inherently larger sample complexity.},
  file = {/Users/ryedida/Zotero/storage/F494TIFF/Schmidt et al. - 2018 - Adversarially robust generalization requires more data(2).pdf}
}

@online{schmidtMinimizingFiniteSums2013,
  title = {Minimizing {{Finite Sums}} with the {{Stochastic Average Gradient}}},
  author = {Schmidt, Mark and Le Roux, Nicolas and Bach, Francis},
  date = {2013-09-10},
  eprint = {1309.2388},
  eprinttype = {arxiv},
  url = {https://arxiv.org/abs/1309.2388v2},
  urldate = {2023-10-10},
  abstract = {We propose the stochastic average gradient (SAG) method for optimizing the sum of a finite number of smooth convex functions. Like stochastic gradient (SG) methods, the SAG method's iteration cost is independent of the number of terms in the sum. However, by incorporating a memory of previous gradient values the SAG method achieves a faster convergence rate than black-box SG methods. The convergence rate is improved from {$O(1/k^{1/2})$} to O(1/k) in general, and when the sum is strongly-convex the convergence rate is improved from the sub-linear O(1/k) to a linear convergence rate of the form {$O(p^k)$} for {$p < 1$}. Further, in many cases the convergence rate of the new method is also faster than black-box deterministic gradient methods, in terms of the number of gradient evaluations. Numerical experiments indicate that the new algorithm often dramatically outperforms existing SG and deterministic gradient methods, and that the performance may be further improved through the use of non-uniform sampling strategies.},
  langid = {english},
  organization = {{arXiv.org}},
  file = {/Users/ryedida/Zotero/storage/IGLZ6PNR/Schmidt et al_2013_Minimizing Finite Sums with the Stochastic Average Gradient.pdf}
}

@report{Schultz2014,
  title = {Tracing {{Knowledge}} and {{Engagement}} in {{Parallel}} in an {{Intelligent Tutoring System}}},
  author = {Schultz, Sarah E. and Arroyo, Ivon},
  date = {2014},
  url = {https://www.researchgate.net/publication/280156170},
  abstract = {Two of the major goals in Educational Data Mining are determining students' state of knowledge and determining whether students are affectively engaged with the task and in positive affective states. These two problems are usually examined separately and multiple methods have been proposed to solve each of them. However, little work has been done on tracing both of these states in parallel and the combined effect on a student's performance. In this work, we propose a model for tracing student engagement in parallel with knowledge as the student uses an Intelligent Tutoring System. We then compare this model to existing methods of tracing student knowledge and engagement.},
  keywords = {affect detection,behavior,engagement,Knowledge tracing,performance},
  file = {/Users/ryedida/Zotero/storage/8BC7E5MM/Schultz, Schultz, Arroyo - 2014 - Tracing Knowledge and Affect in Parallel in an Intelligent Tutoring System Tracing Knowledge and En(2).pdf}
}

@report{Schultz2015,
  author = {Schultz, Sarah E},
  title = {Tracing {{Knowledge}} and {{Engagement}} in {{Parallel}} by {{Observing Behavior}} in {{Intelligent Tutoring Systems}}},
  date = {2015},
  file = {/Users/ryedida/Zotero/storage/U4Q7DDVU/Schultz - 2015 - Tracing Knowledge and Engagement in Parallel by Observing Behavior in Intelligent Tutoring Systems(2).pdf}
}

@inproceedings{Selvaraju2017,
  title = {Grad-{{CAM}}: {{Visual Explanations}} from {{Deep Networks}} via {{Gradient-Based Localization}}},
  booktitle = {Proceedings of the {{IEEE International Conference}} on {{Computer Vision}}},
  author = {Selvaraju, Ramprasaath R. and Cogswell, Michael and Das, Abhishek and Vedantam, Ramakrishna and Parikh, Devi and Batra, Dhruv},
  date = {2017},
  eprint = {1610.02391},
  eprinttype = {arxiv},
  pages = {618--626},
  issn = {1550-5499},
  doi = {10.1109/ICCV.2017.74},
  abstract = {We propose a technique for producing "visual explanations" for decisions from a large class of CNN-based models, making them more transparent. Our approach - Gradient-weighted Class Activation Mapping (Grad-CAM), uses the gradients of any target concept, flowing into the final convolutional layer to produce a coarse localization map highlighting the important regions in the image for predicting the concept. Unlike previous approaches, GradCAM is applicable to a wide variety of CNN model-families: (1) CNNs with fully-connected layers (e.g. VGG), (2) CNNs used for structured outputs (e.g. captioning), (3) CNNs used in tasks with multimodal inputs (e.g. VQA) or reinforcement learning, without any architectural changes or re-training. We combine GradCAM with fine-grained visualizations to create a high-resolution class-discriminative visualization and apply it to off-the-shelf image classification, captioning, and visual question answering (VQA) models, including ResNet-based architectures. In the context of image classification models, our visualizations (a) lend insights into their failure modes (showing that seemingly unreasonable predictions have reasonable explanations), (b) are robust to adversarial images, (c) outperform previous methods on weakly-supervised localization, (d) are more faithful to the underlying model and (e) help achieve generalization by identifying dataset bias. For captioning and VQA, our visualizations show that even non-attention based models can localize inputs. Finally, we conduct human studies to measure if GradCAM explanations help users establish trust in predictions from deep networks and show that GradCAM helps untrained users successfully discern a "stronger" deep network from a "weaker" one. Our code is available at https://github.com/ramprs/grad-cam. A demo and a video of the demo can be found at http://gradcam.cloudcv.org and youtu.be/COjUB9Izk6E.},
  isbn = {9781538610329},
  file = {/Users/ryedida/Zotero/storage/TVRGZWHA/Selvaraju et al. - 2017 - Grad-CAM Visual Explanations from Deep Networks via Gradient-Based Localization(2).pdf}
}

@inproceedings{senEMPIRENSEMBLESMIXED2020,
  title = {{{EMPIR}}: {{Ensembles}} of {{Mixed Precision Deep Networks}} for {{Increased Robustness Against Adversarial Attacks}}},
  booktitle = {8th {{International Conference}} on {{Learning Representations}}, {{ICLR}} 2020},
  author = {Sen, Sanchari and Ravindran, Balaraman and Raghunathan, Anand},
  date = {2020},
  pagetotal = {12},
  abstract = {Ensuring robustness of Deep Neural Networks (DNNs) is crucial to their adoption in safety-critical applications such as self-driving cars, drones, and healthcare. Notably, DNNs are vulnerable to adversarial attacks in which small input perturbations can produce catastrophic misclassifications. In this work, we propose EMPIR, ensembles of quantized DNN models with different numerical precisions, as a new approach to increase robustness against adversarial attacks. EMPIR is based on the observation that quantized neural networks often demonstrate much higher robustness to adversarial attacks than full precision networks, but at the cost of a substantial loss in accuracy on the original (unperturbed) inputs. EMPIR overcomes this limitation to achieve the “best of both worlds”, i.e., the higher unperturbed accuracies of the full precision models combined with the higher robustness of the low precision models, by composing them in an ensemble. Further, as low precision DNN models have significantly lower computational and storage requirements than full precision models, EMPIR models only incur modest compute and memory overheads compared to a single full-precision model ({$<$}25\% in our evaluations). We evaluate EMPIR across a suite of DNNs for 3 different image recognition tasks (MNIST, CIFAR-10 and ImageNet) and under 4 different adversarial attacks. Our results indicate that EMPIR boosts the average adversarial accuracies by 42.6\%, 15.2\% and 10.5\% for the DNN models trained on the MNIST, CIFAR-10 and ImageNet datasets respectively, when compared to single full-precision models, without sacrificing accuracy on the unperturbed inputs.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/AMCYQ4DL/Sen et al. - 2020 - EMPIR ENSEMBLES OF MIXED PRECISION DEEP NETWORKS .pdf}
}

@article{Seong2016,
  title = {Towards {{Flatter Loss Surface}} via {{Nonmonotonic Learning Rate Scheduling}}},
  author = {Seong, Sihyeon and Lee, Yekang and Kee, Youngwook and Han, Dongyoon and Kim, Junmo},
  date = {2016},
  abstract = {Whereas optimizing deep neural networks using stochastic gradient descent has shown great performances in practice, the rule for setting step size (i.e. learning rate) of gradient descent is not well studied. Although it appears that some intriguing learning rate rules such as ADAM (Kingma and Ba, 2014) have since been developed, they concentrated on improving convergence, not on improving generalization capabilities. Recently, the improved generalization property of the flat minima was re-visited, and this research guides us towards promising solutions to many current optimization problems. In this paper, we analyze the flatness of loss surfaces through the lens of robustness to input perturbations and advocate that gradient descent should be guided to reach flatter region of loss surfaces to achieve generalization. Finally, we suggest a learning rate rule for escaping sharp regions of loss surfaces, and we demonstrate the capacity of our approach by performing numerous experiments.},
  file = {/Users/ryedida/Zotero/storage/76JHQQ3P/Seong et al. - 2016 - Towards Flatter Loss Surface via Nonmonotonic Learning Rate Scheduling(2).pdf}
}

@inproceedings{settlesTrainableSpacedRepetition2016,
  title = {A {{Trainable Spaced Repetition Model}} for {{Language Learning}}},
  booktitle = {Proceedings of the 54th {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}} ({{Volume}} 1: {{Long Papers}})},
  author = {Settles, Burr and Meeder, Brendan},
  date = {2016},
  pages = {1848--1858},
  publisher = {{Association for Computational Linguistics}},
  venue = {{Berlin, Germany}},
  doi = {10.18653/v1/P16-1174},
  url = {http://aclweb.org/anthology/P16-1174},
  urldate = {2023-12-09},
  abstract = {We present half-life regression (HLR), a novel model for spaced repetition practice with applications to second language acquisition. HLR combines psycholinguistic theory with modern machine learning techniques, indirectly estimating the “halflife” of a word or concept in a student’s long-term memory. We use data from Duolingo — a popular online language learning application — to fit HLR models, reducing error by 45\%+ compared to several baselines at predicting student recall rates. HLR model weights also shed light on which linguistic concepts are systematically challenging for second language learners. Finally, HLR was able to improve Duolingo daily student engagement by 12\% in an operational user study.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/XZPAM3TV/Settles and Meeder - 2016 - A Trainable Spaced Repetition Model for Language L.pdf}
}

@inproceedings{shafahiAreAdversarialExamples2019,
  title = {Are Adversarial Examples Inevitable?},
  booktitle = {7th {{International Conference}} on {{Learning Representations}}, {{ICLR}} 2019},
  author = {Shafahi, Ali and Huang, Ronny and Studer, Christoph and Feizi, Soheil and Goldstein, Tom},
  date = {2019},
  eprint = {1809.02104},
  eprinttype = {arxiv},
  pages = {1--17},
  abstract = {A wide range of defenses have been proposed to harden neural networks against adversarial attacks. However, a pattern has emerged in which the majority of adversarial defenses are quickly broken by new attacks. Given the lack of success at generating robust defenses, we are led to ask a fundamental question: Are adversarial attacks inevitable? This paper analyzes adversarial examples from a theoretical perspective, and identifies fundamental bounds on the susceptibility of a classifier to adversarial attacks. We show that, for certain classes of problems, adversarial examples are inescapable. Using experiments, we explore the implications of theoretical guarantees for real-world problems and discuss how factors such as dimensionality and image complexity limit a classifier's robustness against adversarial examples.},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  file = {/Users/ryedida/Zotero/storage/TU3IQ63D/Shafahi et al. - 2020 - Are adversarial examples inevitable.pdf}
}

@article{shahriari2015taking,
  author = {Shahriari, Bobak and Swersky, Kevin and Wang, Ziyu and Adams, Ryan P and De Freitas, Nando},
  title = {Taking the Human out of the Loop: {{A}} Review of {{Bayesian}} Optimization},
  date = {2015},
  journaltitle = {Proceedings of the IEEE},
  publisher = {{IEEE}},
  volume = {104},
  number = {1},
  pages = {148--175}
}

@book{shalev-shwartzUnderstandingMachineLearning2014,
  author = {Shalev-Shwartz, Shai and Ben-David, Shai},
  title = {Understanding {{Machine Learning}}: {{From Theory}} to {{Algorithms}}},
  date = {2014},
  location = {{Cambridge}},
  publisher = {{Cambridge University Press}},
  isbn = {978-1-107-05713-5},
  doi = {10.1017/CBO9781107298019},
  url = {https://www.cambridge.org/core/books/understanding-machine-learning/3059695661405D25673058E43C8BE2A6},
  abstract = {Machine learning is one of the fastest growing areas of computer science, with far-reaching applications. The aim of this textbook is to introduce machine learning, and the algorithmic paradigms it offers, in a principled way. The book provides a theoretical account of the fundamentals underlying machine learning and the mathematical derivations that transform these principles into practical algorithms. Following a presentation of the basics, the book covers a wide array of central topics unaddressed by previous textbooks. These include a discussion of the computational complexity of learning and the concepts of convexity and stability; important algorithmic paradigms including stochastic gradient descent, neural networks, and structured output learning; and emerging theoretical concepts such as the PAC-Bayes approach and compression-based bounds. Designed for advanced undergraduates or beginning graduates, the text makes the fundamentals and algorithms of machine learning accessible to students and non-expert readers in statistics, computer science, mathematics and engineering.},
  file = {/Users/ryedida/Zotero/storage/PB26AWBU/Shalev-Shwartz_Ben-David_2014_Understanding Machine Learning.pdf}
}

@online{shamirSimpleExplanationExistence2019,
  author = {Shamir, Adi and Safran, Itay and Ronen, Eyal and Dunkelman, Orr},
  title = {A {{Simple Explanation}} for the {{Existence}} of {{Adversarial Examples}} with {{Small Hamming Distance}}},
  date = {2019-01-30},
  pubstate = {preprint},
  eprinttype = {arxiv},
  eprint = {1901.10861},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1901.10861},
  urldate = {2023-12-09},
  langid = {english},
  keywords = {Computer Science - Cryptography and Security,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract = {The existence of adversarial examples in which an imperceptible change in the input can fool well trained neural networks was experimentally discovered by Szegedy et al in 2013, who called them “Intriguing properties of neural networks”. Since then, this topic had become one of the hottest research areas within machine learning, but the ease with which we can switch between any two decisions in targeted attacks is still far from being understood, and in particular it is not clear which parameters determine the number of input coordinates we have to change in order to mislead the network. In this paper we develop a simple mathematical framework which enables us to think about this baffling phenomenon from a fresh perspective, turning it into a natural consequence of the geometry of Rn with the L0 (Hamming) metric, which can be quantitatively analyzed. In particular, we explain why we should expect to find targeted adversarial examples with Hamming distance of roughly m in arbitrarily deep neural networks which are designed to distinguish between m input classes.},
  file = {/Users/ryedida/Zotero/storage/WVNYJPLT/Shamir et al. - 2019 - A Simple Explanation for the Existence of Adversar.pdf}
}

@article{shannonMathematicalTheoryCommunication,
  title = {A Mathematical Theory of Communication},
  author = {Shannon, Claude E.},
  date = {1948},
  journaltitle = {The Bell System Technical Journal},
  volume = {27},
  pages = {379--423, 623--656},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/TSNWN8VX/Shannon - A mathematical theory of communication.pdf}
}

@inproceedings{Shao2017,
  title = {Scanpath {{Prediction Based}} on {{High-Level Features}} and {{Memory Bias}}},
  booktitle = {Neural {{Information Processing}}},
  author = {Shao, Xuan and Luo, Ye and Zhu, Dandan and Li, Shuqin and Itti, Laurent and Lu, Jianwei},
  date = {2017},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {10636},
  pages = {3--13},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-70090-8_1},
  isbn = {9783319700892},
  keywords = {Fixation duration,Memory bias,Scanpath prediction,Semantic features}
}

% arXiv preprint; eprintclass holds the full primary category (archive.subject).
@unpublished{shenBaselineNeedsMore2018,
  title = {Baseline {{Needs More Love}}: {{On Simple Word-Embedding-Based Models}} and {{Associated Pooling Mechanisms}}},
  shorttitle = {Baseline {{Needs More Love}}},
  author = {Shen, Dinghan and Wang, Guoyin and Wang, Wenlin and Min, Martin Renqiang and Su, Qinliang and Zhang, Yizhe and Li, Chunyuan and Henao, Ricardo and Carin, Lawrence},
  date = {2018-05-24},
  eprint = {1805.09843},
  eprinttype = {arxiv},
  eprintclass = {cs.CL},
  url = {http://arxiv.org/abs/1805.09843},
  urldate = {2022-01-07},
  abstract = {Many deep learning architectures have been proposed to model the compositionality in text sequences, requiring a substantial number of parameters and expensive computations. However, there has not been a rigorous evaluation regarding the added value of sophisticated compositional functions. In this paper, we conduct a point-by-point comparative study between Simple Word-Embedding-based Models (SWEMs), consisting of parameter-free pooling operations, relative to word-embedding-based RNN/CNN models. Surprisingly, SWEMs exhibit comparable or even superior performance in the majority of cases considered. Based upon this understanding, we propose two additional pooling strategies over learned word embeddings: (i) a max-pooling operation for improved interpretability; and (ii) a hierarchical pooling operation, which preserves spatial (n-gram) information within text sequences. We present experiments on 17 datasets encompassing three tasks: (i) (long) document classification; (ii) text sequence matching; and (iii) short text tasks, including classification and tagging. The source code and datasets can be obtained from https://github.com/dinghanshen/SWEM.},
  langid = {english},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning,simple},
  annotation = {222 citations (Semantic Scholar/arXiv) [2022-01-07]},
  file = {/Users/ryedida/Zotero/storage/2RTNP9NA/Shen et al. - 2018 - Baseline Needs More Love On Simple Word-Embedding.pdf}
}

@inproceedings{shenMathematicalUnderstandingDifficulty2018,
  author = {Shen, Hao},
  title = {Towards a {{Mathematical Understanding}} of the {{Difficulty}} in {{Learning}} with {{Feedforward Neural Networks}}},
  date = {2018-06},
  booktitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
  eventtitle = {2018 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  location = {{Salt Lake City, UT}},
  publisher = {{IEEE}},
  pages = {811--820},
  isbn = {978-1-5386-6420-9},
  doi = {10.1109/CVPR.2018.00091},
  url = {https://ieeexplore.ieee.org/document/8578189/},
  urldate = {2023-12-11},
  langid = {english},
  abstract = {Training deep neural networks for solving machine learning problems is one great challenge in the field, mainly due to its associated optimisation problem being highly non-convex. Recent developments have suggested that many training algorithms do not suffer from undesired local minima under certain scenario, and consequently led to great efforts in pursuing mathematical explanations for such observations. This work provides an alternative mathematical understanding of the challenge from a smooth optimisation perspective. By assuming exact learning of finite samples, sufficient conditions are identified via a critical point analysis to ensure any local minimum to be globally minimal as well. Furthermore, a state of the art algorithm, known as the Generalised Gauss-Newton (GGN) algorithm, is rigorously revisited as an approximate Newton’s algorithm, which shares the property of being locally quadratically convergent to a global minimum under the condition of exact learning.},
  file = {/Users/ryedida/Zotero/storage/UFGAKAJ2/Shen - 2018 - Towards a Mathematical Understanding of the Diffic.pdf}
}

@article{shortSideeffectsAssociatedKetamine2018,
  author = {Short, Brooke and Fong, Joanna and Galvez, Veronica and Shelker, William and Loo, Colleen K},
  title = {Side-Effects Associated with Ketamine Use in Depression: A Systematic Review},
  shorttitle = {Side-Effects Associated with Ketamine Use in Depression},
  journaltitle = {The Lancet Psychiatry},
  shortjournal = {The Lancet Psychiatry},
  date = {2018-01},
  volume = {5},
  number = {1},
  pages = {65--78},
  issn = {22150366},
  doi = {10.1016/S2215-0366(17)30272-9},
  url = {https://linkinghub.elsevier.com/retrieve/pii/S2215036617302729},
  urldate = {2023-09-21},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/5Z9STYPS/Short et al. - 2018 - Side-effects associated with ketamine use in depre.pdf}
}

% arXiv preprint; eprintclass holds the full primary category (cs.CV, matching the keyword below).
@online{simonyanVeryDeepConvolutional2015,
  title = {Very {{Deep Convolutional Networks}} for {{Large-Scale Image Recognition}}},
  author = {Simonyan, Karen and Zisserman, Andrew},
  date = {2015-04-10},
  eprint = {1409.1556},
  eprinttype = {arxiv},
  eprintclass = {cs.CV},
  url = {http://arxiv.org/abs/1409.1556},
  urldate = {2023-11-26},
  abstract = {In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3 × 3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16–19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision.},
  langid = {english},
  pubstate = {preprint},
  keywords = {Computer Science - Computer Vision and Pattern Recognition},
  file = {/Users/ryedida/Zotero/storage/Y8BTUYJV/Simonyan and Zisserman - 2015 - Very Deep Convolutional Networks for Large-Scale I.pdf}
}

% "Aguera y Arcas" is a single compound surname; it is braced so that the
% lowercase "y" is not parsed as a name prefix (von part).
@online{singhalLargeLanguageModels2022,
  title = {Large {{Language Models Encode Clinical Knowledge}}},
  author = {Singhal, Karan and Azizi, Shekoofeh and Tu, Tao and Mahdavi, S. Sara and Wei, Jason and Chung, Hyung Won and Scales, Nathan and Tanwani, Ajay and Cole-Lewis, Heather and Pfohl, Stephen and Payne, Perry and Seneviratne, Martin and Gamble, Paul and Kelly, Chris and Scharli, Nathaneal and Chowdhery, Aakanksha and Mansfield, Philip and {Aguera y Arcas}, Blaise and Webster, Dale and Corrado, Greg S. and Matias, Yossi and Chou, Katherine and Gottweis, Juraj and Tomasev, Nenad and Liu, Yun and Rajkomar, Alvin and Barral, Joelle and Semturs, Christopher and Karthikesalingam, Alan and Natarajan, Vivek},
  date = {2022-12-26},
  eprint = {2212.13138},
  eprinttype = {arxiv},
  eprintclass = {cs.CL},
  url = {http://arxiv.org/abs/2212.13138},
  urldate = {2023-10-06},
  abstract = {Large language models (LLMs) have demonstrated impressive capabilities in natural language understanding and generation, but the quality bar for medical and clinical applications is high. Today, attempts to assess models' clinical knowledge typically rely on automated evaluations on limited benchmarks. There is no standard to evaluate model predictions and reasoning across a breadth of tasks. To address this, we present MultiMedQA, a benchmark combining six existing open question answering datasets spanning professional medical exams, research, and consumer queries; and HealthSearchQA, a new free-response dataset of medical questions searched online. We propose a framework for human evaluation of model answers along multiple axes including factuality, precision, possible harm, and bias. In addition, we evaluate PaLM (a 540-billion parameter LLM) and its instruction-tuned variant, Flan-PaLM, on MultiMedQA. Using a combination of prompting strategies, Flan-PaLM achieves state-of-the-art accuracy on every MultiMedQA multiple-choice dataset (MedQA, MedMCQA, PubMedQA, MMLU clinical topics), including 67.6\% accuracy on MedQA (US Medical License Exam questions), surpassing prior state-of-the-art by over 17\%. However, human evaluation reveals key gaps in Flan-PaLM responses. To resolve this we introduce instruction prompt tuning, a parameter-efficient approach for aligning LLMs to new domains using a few exemplars. The resulting model, Med-PaLM, performs encouragingly, but remains inferior to clinicians. We show that comprehension, recall of knowledge, and medical reasoning improve with model scale and instruction prompt tuning, suggesting the potential utility of LLMs in medicine. Our human evaluations reveal important limitations of today's models, reinforcing the importance of both evaluation frameworks and method development in creating safe, helpful LLM models for clinical applications.},
  langid = {english},
  pubstate = {preprint},
  keywords = {Computer Science - Computation and Language},
  file = {/Users/ryedida/Zotero/storage/9TG4ZY9K/Singhal et al. - 2022 - Large Language Models Encode Clinical Knowledge.pdf}
}

% Title restored from the all-caps template heading (line-break hyphen removed).
% NOTE(review): authors, year and arXiv id were added from the public arXiv
% record of this paper -- confirm against the stored PDF.
@online{SLOTMACHINESDISCOVERING,
  title = {Slot {{Machines}}: {{Discovering Winning Combinations}} of {{Random Weights}} in {{Neural Networks}}},
  shorttitle = {Slot {{Machines}}},
  author = {Aladago, Maxwell Mbabilla and Torresani, Lorenzo},
  date = {2021},
  eprint = {2101.06475},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/2101.06475},
  abstract = {In contrast to traditional weight optimization in a continuous space, we demonstrate the existence of effective random networks whose weights are never updated. By selecting a weight among a fixed set of random values for each individual connection, our method uncovers combinations of random weights that match the performance of traditionally-trained networks of the same capacity. We refer to our networks as "slot machines" where each reel (connection) contains a fixed set of symbols (random values). Our backpropagation algorithm "spins" the reels to seek "winning" combinations, i.e., selections of random weight values that minimize the given loss. Quite surprisingly, we find that allocating just a few random values to each connection (e.g., 8 values per connection) yields highly competitive combinations despite being dramatically more constrained compared to traditionally learned weights. Moreover, finetuning these combinations often improves performance over the trained baselines. A randomly initialized VGG-19 with 8 values per connection contains a combination that achieves 90\% test accuracy on CIFAR-10. Our method also achieves an impressive performance of 98.1\% on MNIST for neural networks containing only random weights.},
  pubstate = {preprint}
}

% The doi field holds the registered arXiv DOI (10.48550/arXiv.<id>), not the
% "arXiv:<id>" identifier, which already lives in eprint.
@unpublished{Smith2017,
  title = {Super-{{Convergence}}: {{Very Fast Training}} of {{Neural Networks Using Large Learning Rates}}},
  author = {Smith, Leslie N. and Topin, Nicholay},
  date = {2017},
  eprint = {1708.07120},
  eprinttype = {arxiv},
  pages = {1--18},
  doi = {10.48550/arXiv.1708.07120},
  url = {http://arxiv.org/abs/1708.07120},
  abstract = {In this paper, we describe a phenomenon, which we named "super-convergence", where neural networks can be trained an order of magnitude faster than with standard training methods. The existence of super-convergence is relevant to understanding why deep networks generalize well. One of the key elements of super-convergence is training with one learning rate cycle and a large maximum learning rate. A primary insight that allows super-convergence training is that large learning rates regularize the training, hence requiring a reduction of all other forms of regularization in order to preserve an optimal regularization balance. We also derive a simplification of the Hessian Free optimization method to compute an estimate of the optimal learning rate. Experiments demonstrate super-convergence for Cifar-10/100, MNIST and Imagenet datasets, and resnet, wide-resnet, densenet, and inception architectures. In addition, we show that super-convergence provides a greater boost in performance relative to standard training when the amount of labeled training data is limited. The architectures and code to replicate the figures in this paper are available at github.com/lnsmith54/super-convergence. See http://www.fast.ai/2018/04/30/dawnbench-fastai/ for an application of super-convergence to win the DAWNBench challenge (see https://dawn.cs.stanford.edu/benchmark/).}
}

% Conference paper (WACV 2017): the proceedings title goes in booktitle.
% The former `issue = {April}` value was a reference-manager artefact, not an issue designation.
@inproceedings{Smith2017a,
  title = {Cyclical Learning Rates for Training Neural Networks},
  author = {Smith, Leslie N.},
  date = {2017},
  booktitle = {2017 {{IEEE Winter Conference}} on {{Applications}} of {{Computer Vision}} ({{WACV}})},
  eprint = {1506.01186},
  eprinttype = {arxiv},
  pages = {464--472},
  publisher = {{IEEE}},
  issn = {15635147},
  doi = {10.1109/WACV.2017.58},
  abstract = {It is known that the learning rate is the most important hyper-parameter to tune for training deep neural networks. This paper describes a new method for setting the learning rate, named cyclical learning rates, which practically eliminates the need to experimentally find the best values and schedule for the global learning rates. Instead of monotonically decreasing the learning rate, this method lets the learning rate cyclically vary between reasonable boundary values. Training with cyclical learning rates instead of fixed values achieves improved classification accuracy without a need to tune and often in fewer iterations. This paper also describes a simple way to estimate "reasonable bounds" -- linearly increasing the learning rate of the network for a few epochs. In addition, cyclical learning rates are demonstrated on the CIFAR-10 and CIFAR-100 datasets with ResNets, Stochastic Depth networks, and DenseNets, and the ImageNet dataset with the AlexNet and GoogLeNet architectures. These are practical tools for everyone who trains neural networks.},
  isbn = {978-1-5090-4822-9}
}

% Author spelled "Smith, Leslie N." as in the other entries by the same author,
% so name-based sorting and disambiguation treat them as one person.
@unpublished{smithDisciplinedApproachNeural2018,
  title = {A {{Disciplined Approach}} to {{Neural Network Hyper-Parameters}}: {{Part}} 1 - {{Learning Rate}}, {{Batch Size}}, {{Momentum}}, and {{Weight Decay}}},
  author = {Smith, Leslie N.},
  date = {2018},
  eprint = {1803.09820v2},
  eprinttype = {arxiv},
  abstract = {Although deep learning has produced dazzling successes for applications of image, speech, and video processing in the past few years, most trainings are with suboptimal hyper-parameters, requiring unnecessarily long training times. Setting the hyper-parameters remains a black art that requires years of experience to acquire. This report proposes several efficient ways to set the hyper-parameters that significantly reduce training time and improves performance. Specifically, this report shows how to examine the training validation/test loss function for subtle clues of underfitting and overfitting and suggests guidelines for moving toward the optimal balance point. Then it discusses how to increase/decrease the learning rate/momentum to speed up training. Our experiments show that it is crucial to balance every manner of regularization for each dataset and architecture. Weight decay is used as a sample regularizer to show how its optimal value is tightly coupled with the learning rates and momentum. Files to help replicate the results reported here are available at https://github.com/lnsmith54/hyperParam1.},
  file = {/Users/ryedida/Zotero/storage/8TZLTLPP/Smith - 2018 - A Disciplined Approach to Neural Network Hyper-Parameters Part 1 - Learning Rate, Batch Size, Momentum, and Weight Dec(2).pdf}
}

% NeurIPS 2012 proceedings paper: proceedings title in booktitle; "Bayesian" is
% brace-protected so sentence-casing styles keep the capital.
@inproceedings{snoek2012practical,
  title = {Practical {{Bayesian}} Optimization of Machine Learning Algorithms},
  author = {Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P.},
  date = {2012},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  volume = {25},
  pages = {2951--2959}
}

% "Prabhat" is a single-name author: braced so it is kept as one indivisible
% name instead of being given a "Mr" first name.
@inproceedings{snoek2015scalable,
  title = {Scalable {{Bayesian}} Optimization Using Deep Neural Networks},
  booktitle = {International Conference on Machine Learning},
  author = {Snoek, Jasper and Rippel, Oren and Swersky, Kevin and Kiros, Ryan and Satish, Nadathur and Sundaram, Narayanan and Patwary, Mostofa and {Prabhat} and Adams, Ryan P.},
  date = {2015},
  pages = {2171--2180},
  publisher = {{PMLR}}
}

% Conference paper (ICLR 2018): the proceedings title goes in booktitle.
@inproceedings{Song2018,
  title = {{{PixelDefend}}: {{Leveraging}} Generative Models to Understand and Defend against Adversarial Examples},
  author = {Song, Yang and Nowozin, Sebastian and Kushman, Nate and Kim, Taesup and Ermon, Stefano},
  date = {2018},
  booktitle = {6th {{International Conference}} on {{Learning Representations}}, {{ICLR}} 2018 - {{Conference Track Proceedings}}},
  eprint = {1710.10766},
  eprinttype = {arxiv},
  pages = {1--20},
  abstract = {Adversarial perturbations of normal images are usually imperceptible to humans, but they can seriously confuse state-of-the-art machine learning models. What makes them so special in the eyes of image classifiers? In this paper, we show empirically that adversarial examples mainly lie in the low probability regions of the training distribution, regardless of attack types and targeted models. Using statistical hypothesis testing, we find that modern neural density models are surprisingly good at detecting imperceptible image perturbations. Based on this discovery, we devised PixelDefend, a new approach that purifies a maliciously perturbed image by moving it back towards the distribution seen in the training data. The purified image is then run through an unmodified classifier, making our method agnostic to both the classifier and the attacking method. As a result, PixelDefend can be used to protect already deployed models and be combined with other model-specific defenses. Experiments show that our method greatly improves resilience across a wide variety of state-of-the-art attacking methods, increasing accuracy on the strongest attack from 63\% to 84\% for Fashion MNIST and from 32\% to 70\% for CIFAR-10.},
  file = {/Users/ryedida/Zotero/storage/RPTM7DXK/Song et al. - 2018 - PixelDefend Leveraging generative models to understand and defend against adversarial examples(2).pdf}
}

% PDF-extraction hyphenation artefacts removed from abstract and keywords;
% the url carries an explicit scheme so it resolves as a link.
@report{Spaulding,
  title = {Affect-{{Aware Student Models}} for {{Robot Tutors}}},
  author = {Spaulding, Samuel and Gordon, Goren and Breazeal, Cynthia},
  url = {http://www.ifaamas.org},
  abstract = {Computational tutoring systems, such as educational software or interactive robots, have the potential for great societal benefit. Such systems track and assess students' knowledge via inferential methods, such as the popular Bayesian Knowledge Tracing (BKT) algorithm. However, these methods do not typically draw on the affective signals that human teachers use to assess knowledge, such as indications of discomfort, engagement, or frustration. In this paper we present a novel extension to the BKT model that uses affective data, derived autonomously from video records of children playing an interactive story-telling game with a robot, to infer student knowledge of reading skills. We find that, compared to a control group of children who played the game with only a tablet, children who interacted with an embodied social robot generated stronger affective data signals of engagement and enjoyment during the interaction. We then show that incorporating this affective data into model training improves the quality of the learned knowledge inference models. These results suggest that physically embodied, affect-aware robot tutors can provide more effective and empathic educational experiences for children, and advance both algorithmic and human-centered motivations for further development of systems that tightly integrate affect understanding and complex models of inference with interactive, educational robots.},
  keywords = {affective computing,child-robot interaction,educational robots,socially assistive robots},
  file = {/Users/ryedida/Zotero/storage/BTW52XE4/Spaulding, Gordon, Breazeal - Unknown - Affect-Aware Student Models for Robot Tutors(2).pdf}
}

% The title excludes the first section heading ("1 A simple example") that the
% PDF import had appended. The abstract is the extracted opening text of the
% tutorial and is left as imported.
% NOTE(review): institution inferred from the author's sjsu.edu address in the abstract -- confirm.
@report{Stamp2018,
  title = {A {{Revealing Introduction}} to {{Hidden Markov Models}}},
  author = {Stamp, Mark},
  date = {2018},
  institution = {{San Jose State University}},
  abstract = {Suppose we want to determine the average annual temperature at a particular location on earth over a series of years. To make it interesting, suppose the years we are concerned with lie in the distant past, before thermometers were invented. Since we can't go back in time, we instead look for indirect evidence of the temperature. To simplify the problem, we only consider two annual temperatures, "hot" and "cold". Suppose that modern evidence indicates that the probability of a hot year followed by another hot year is 0.7 and the probability that a cold year is followed by another cold year is 0.6. We'll assume that these probabilities held in the distant past as well. The information so far can be summarized as H C H C 0.7 0.3 0.4 0.6 (1) where H is "hot" and C is "cold". Also suppose that current research indicates a correlation between the size of tree growth rings and temperature. For simplicity, we only consider three different tree ring sizes, small, medium and large, or S, M and L, respectively. Finally, suppose that based on available evidence, the probabilistic relationship between annual temperature and tree ring sizes is given by S M L H C 0.1 0.4 0.5 0.7 0.2 0.1. (2) * Email: mark.stamp@sjsu.edu-Note that this was originally published online in 2004. This version is essentially the same, but minor corrections and additions have been made over time, with new (and improved!) exercises added.},
  file = {/Users/ryedida/Zotero/storage/NNLR2T2Y/Stamp - 2018 - A Revealing Introduction to Hidden Markov Models 1 A simple example(2).pdf}
}

@article{stanley2002evolving,
  author = {Stanley, Kenneth O and Miikkulainen, Risto},
  title = {Evolving Neural Networks through Augmenting Topologies},
  journaltitle = {Evolutionary computation},
  publisher = {{MIT Press}},
  date = {2002},
  volume = {10},
  number = {2},
  pages = {99--127}
}

@article{steinerImpactDeepLearning2018,
  author = {Steiner, David F. and MacDonald, Robert and Liu, Yun and Truszkowski, Peter and Hipp, Jason D. and Gammage, Christopher and Thng, Florence and Peng, Lily and Stumpe, Martin C.},
  title = {Impact of {{Deep Learning Assistance}} on the {{Histopathologic Review}} of {{Lymph Nodes}} for {{Metastatic Breast Cancer}}},
  journaltitle = {The American Journal of Surgical Pathology},
  date = {2018-12},
  volume = {42},
  number = {12},
  pages = {1636--1646},
  issn = {0147-5185},
  doi = {10.1097/PAS.0000000000001151},
  url = {https://journals.lww.com/ajsp/Fulltext/2018/12000/Impact_of_Deep_Learning_Assistance_on_the.7.aspx},
  urldate = {2021-04-15},
  langid = {american},
  annotation = {122 citations (Semantic Scholar/DOI) [2021-04-15]},
  abstract = {Advances in the quality of whole-slide images have set the stage for the clinical use of digital images in anatomic pathology. Along with advances in computer image analysis, this raises the possibility for computer-assisted diagnostics in pathology to improve histopathologic interpretation and clinical care. To evaluate the potential impact of digital assistance on interpretation of digitized slides, we conducted a multireader multicase study utilizing our deep learning algorithm for the detection of breast cancer metastasis in lymph nodes. Six pathologists reviewed 70 digitized slides from lymph node sections in 2 reader modes, unassisted and assisted, with a wash-out period between sessions. In the assisted mode, the deep learning algorithm was used to identify and outline regions with high likelihood of containing tumor. Algorithm-assisted pathologists demonstrated higher accuracy than either the algorithm or the pathologist alone. In particular, algorithm assistance significantly increased the sensitivity of detection for micrometastases (91\% vs. 83\%, P=0.02). In addition, average review time per image was significantly shorter with assistance than without assistance for both micrometastases (61 vs. 116\,s, P=0.002) and negative images (111 vs. 137\,s, P=0.018). Lastly, pathologists were asked to provide a numeric score regarding the difficulty of each image classification. On the basis of this score, pathologists considered the image review of micrometastases to be significantly easier when interpreted with assistance (P=0.0005). Utilizing a proof of concept assistant tool, this study demonstrates the potential of a deep learning algorithm to improve pathologist accuracy and efficiency in a digital pathology workflow.},
  file = {/Users/ryedida/Zotero/storage/FNAJSSEL/Steiner et al. - 2018 - Impact of Deep Learning Assistance on the Histopat.pdf}
}

% Conference paper (CVPR 2019): proceedings title in booktitle. The former
% `volume = {2019-June}` value was an indexing artefact, not a volume number.
@inproceedings{Stutz2019,
  title = {Disentangling Adversarial Robustness and Generalization},
  author = {Stutz, David and Hein, Matthias and Schiele, Bernt},
  date = {2019-06},
  booktitle = {2019 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
  eprint = {1812.00740},
  eprinttype = {arxiv},
  pages = {6969--6980},
  publisher = {{IEEE}},
  issn = {1063-6919},
  doi = {10.1109/CVPR.2019.00714},
  abstract = {Obtaining deep networks that are robust against adversarial examples and generalize well is an open problem. A recent hypothesis even states that both robust and accurate models are impossible, i.e., adversarial robustness and generalization are conflicting goals. In an effort to clarify the relationship between robustness and generalization, we assume an underlying, low-dimensional data manifold and show that: 1. regular adversarial examples leave the manifold; 2. adversarial examples constrained to the manifold, i.e., on-manifold adversarial examples, exist; 3. on-manifold adversarial examples are generalization errors, and on-manifold adversarial training boosts generalization; 4. regular robustness and generalization are not necessarily contradicting goals. These assumptions imply that both robust and accurate models are possible. However, different models (architectures, training strategies etc.) can exhibit different robustness and generalization characteristics. To confirm our claims, we present extensive experiments on synthetic data (with known manifold) as well as on EMNIST, Fashion-MNIST and CelebA.},
  isbn = {978-1-7281-3293-8},
  keywords = {Datasets and Evaluation,Deep Learning,Representation Learning},
  file = {/Users/ryedida/Zotero/storage/C2Y83PCA/Stutz, Hein, Schiele - 2019 - Disentangling adversarial robustness and generalization(2).pdf}
}

% Conference paper (ECCV 2018) published in the LNCS book series: the series
% name belongs in `series` and `volume` holds only the LNCS number.
@inproceedings{Su2018,
  title = {Is Robustness the Cost of Accuracy? – {{A}} Comprehensive Study on the Robustness of 18 Deep Image Classification Models},
  author = {Su, Dong and Zhang, Huan and Chen, Hongge and Yi, Jinfeng and Chen, Pin Yu and Gao, Yupeng},
  date = {2018},
  booktitle = {Computer {{Vision}} – {{ECCV}} 2018},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {11216},
  eprint = {1808.01688},
  eprinttype = {arxiv},
  pages = {644--661},
  publisher = {{Springer}},
  issn = {1611-3349},
  doi = {10.1007/978-3-030-01258-8_39},
  abstract = {The prediction accuracy has been the long-lasting and sole standard for comparing the performance of different image classification models, including the ImageNet competition. However, recent studies have highlighted the lack of robustness in well-trained deep neural networks to adversarial examples. Visually imperceptible perturbations to natural images can easily be crafted and mislead the image classifiers towards misclassification. To demystify the trade-offs between robustness and accuracy, in this paper we thoroughly benchmark 18 ImageNet models using multiple robustness metrics, including the distortion, success rate and transferability of adversarial examples between 306 pairs of models. Our extensive experimental results reveal several new insights: (1) linear scaling law - the empirical ℓ2 and ℓ∞ distortion metrics scale linearly with the logarithm of classification error; (2) model architecture is a more critical factor to robustness than model size, and the disclosed accuracy-robustness Pareto frontier can be used as an evaluation criterion for ImageNet model designers; (3) for a similar network architecture, increasing network depth slightly improves robustness in ℓ∞ distortion; (4) there exist models (in VGG family) that exhibit high adversarial transferability, while most adversarial examples crafted from one model can only be transferred within the same family. Experiment code is publicly available at https://github.com/huanzhang12/Adversarial\_Survey.},
  isbn = {978-3-030-01257-1},
  keywords = {Adversarial attacks,Deep neural networks,Robustness},
  file = {/Users/ryedida/Zotero/storage/GWIT225I/Su et al. - 2018 - Is robustness the cost of accuracy – A comprehensive study on the robustness of 18 deep image classification mod(2).pdf}
}

@unpublished{Suggala2018,
  author = {Suggala, Arun Sai and Prasad, Adarsh and Nagarajan, Vaishnavh and Ravikumar, Pradeep},
  title = {Revisiting {{Adversarial Risk}}},
  date = {2018},
  number = {2},
  eprinttype = {arxiv},
  eprint = {1806.02924},
  url = {http://arxiv.org/abs/1806.02924},
  abstract = {Recent works on adversarial perturbations show that there is an inherent trade-off between standard test accuracy and adversarial accuracy. Specifically, they show that no classifier can simultaneously be robust to adversarial perturbations and achieve high standard test accuracy. However, this is contrary to the standard notion that on tasks such as image classification, humans are robust classifiers with low error rate. In this work, we show that the main reason behind this confusion is the inexact definition of adversarial perturbation that is used in the literature. To fix this issue, we propose a slight, yet important modification to the existing definition of adversarial perturbation. Based on the modified definition, we show that there is no trade-off between adversarial and standard accuracies; there exist classifiers that are robust and achieve high standard accuracy. We further study several properties of this new definition of adversarial risk and its relation to the existing definition.}
}

% shortjournal holds the abbreviated form of IEEE Transactions on Software Engineering.
@article{sunDexBERTEffectiveTaskAgnostic2023,
  title = {{{DexBERT}}: {{Effective}}, {{Task-Agnostic}} and {{Fine-grained Representation Learning}} of {{Android Bytecode}}},
  shorttitle = {{{DexBERT}}},
  author = {Sun, Tiezhu and Allix, Kevin and Kim, Kisub and Zhou, Xin and Kim, Dongsun and Lo, David and Bissyandé, Tegawendé F. and Klein, Jacques},
  date = {2023},
  journaltitle = {IEEE Transactions on Software Engineering},
  shortjournal = {IEEE Trans. Software Eng.},
  pages = {1--16},
  issn = {0098-5589, 1939-3520, 2326-3881},
  doi = {10.1109/TSE.2023.3310874},
  url = {https://ieeexplore.ieee.org/document/10237047/},
  urldate = {2023-10-10},
  abstract = {The automation of an increasingly large number of software engineering tasks is becoming possible thanks to Machine Learning (ML). One foundational building block in the application of ML to software artifacts is the representation of these artifacts (e.g., source code or executable code) into a form that is suitable for learning. Traditionally, researchers and practitioners have relied on manually selected features, based on expert knowledge, for the task at hand. Such knowledge is sometimes imprecise and generally incomplete. To overcome this limitation, many studies have leveraged representation learning, delegating to ML itself the job of automatically devising suitable representations and selections of the most relevant features. Yet, in the context of Android problems, existing models are either limited to coarse-grained whole-app level (e.g., apk2vec) or conducted for one specific downstream task (e.g., smali2vec). Thus, the produced representation may turn out to be unsuitable for fine-grained tasks or cannot generalize beyond the task that they have been trained on. Our work is part of a new line of research that investigates effective, task-agnostic, and fine-grained universal representations of bytecode to mitigate both of these two limitations. Such representations aim to capture information relevant to various low-level downstream tasks (e.g., at the class-level). We are inspired by the field of Natural Language Processing, where the problem of universal representation was addressed by building Universal Language Models, such as BERT, whose goal is to capture abstract semantic information about sentences, in a way that is reusable for a variety of tasks. We propose DexBERT, a BERT-like Language Model dedicated to representing chunks of DEX bytecode, the main binary format used in Android applications.
We empirically assess whether DexBERT is able to model the DEX language and evaluate the suitability of our model in three distinct class-level software engineering tasks: Malicious Code Localization, Defect Prediction, and Component Type Classification. We also experiment with strategies to deal with the problem of catering to apps having vastly different sizes, and we demonstrate one example of using our technique to investigate what information is relevant to a given task.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/AQQG5H2U/Sun et al. - 2023 - DexBERT Effective, Task-Agnostic and Fine-grained.pdf}
}

@article{Sutskever2014,
  title = {Dropout: {{A Simple Way}} to {{Prevent Neural Networks}} from {{Overfitting}}},
  author = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},
  date = {2014},
  journaltitle = {Journal of Machine Learning Research},
  volume = {15},
  pages = {1929--1958},
  issn = {1532-4435, 1533-7928},
  abstract = {Deep neural nets with a large number of parameters are very powerful machine learning systems. However, overfitting is a serious problem in such networks. Large networks are also slow to use, making it difficult to deal with overfitting by combining the predictions of many different large neural nets at test time. Dropout is a technique for addressing this problem. The key idea is to randomly drop units (along with their connections) from the neural network during training. This prevents units from co-adapting too much. During training, dropout samples from an exponential number of different "thinned" networks. At test time, it is easy to approximate the effect of averaging the predictions of all these thinned networks by simply using a single unthinned network that has smaller weights. This significantly reduces overfitting and gives major improvements over other regularization methods. We show that dropout improves the performance of neural networks on supervised learning tasks in vision, speech recognition, document classification and computational biology, obtaining state-of-the-art results on many benchmark data sets.},
  keywords = {deep learning,model combination,neural networks,regularization},
  file = {/Users/ryedida/Zotero/storage/4XT9K2N8/Sutskever et al. - 2014 - Dropout A Simple Way to Prevent Neural Networks from Overfitting(2).pdf}
}

@inproceedings{sutskeverImportanceInitializationMomentum,
  title = {On the Importance of Initialization and Momentum in Deep Learning},
  author = {Sutskever, Ilya and Martens, James and Dahl, George and Hinton, Geoffrey},
  date = {2013},
  booktitle = {Proceedings of the 30th International Conference on Machine Learning},
  pages = {1139--1147},
  publisher = {{PMLR}},
  abstract = {Deep and recurrent neural networks (DNNs and RNNs respectively) are powerful models that were considered to be almost impossible to train using stochastic gradient descent with momentum. In this paper, we show that when stochastic gradient descent with momentum uses a well-designed random initialization and a particular type of slowly increasing schedule for the momentum parameter, it can train both DNNs and RNNs (on datasets with long-term dependencies) to levels of performance that were previously achievable only with Hessian-Free optimization. We find that both the initialization and the momentum are crucial since poorly initialized networks cannot be trained with momentum and well-initialized networks perform markedly worse when the momentum is absent or poorly tuned.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/RYY9M2IW/Sutskever et al. - On the importance of initialization and momentum i.pdf}
}

@unpublished{swersky2014freeze,
  title = {Freeze-Thaw {{Bayesian}} Optimization},
  author = {Swersky, Kevin and Snoek, Jasper and Adams, Ryan Prescott},
  date = {2014},
  eprint = {1406.3896},
  eprinttype = {arxiv}
}

@inproceedings{Szegedy2014,
  title = {Intriguing Properties of Neural Networks},
  author = {Szegedy, Christian and Zaremba, Wojciech and Sutskever, Ilya and Bruna, Joan and Erhan, Dumitru and Goodfellow, Ian and Fergus, Rob},
  date = {2014},
  booktitle = {2nd International Conference on Learning Representations, {{ICLR}} 2014 - Conference Track Proceedings},
  eprint = {1312.6199},
  eprinttype = {arxiv},
  pages = {1--10},
  abstract = {Deep neural networks are highly expressive models that have recently achieved state of the art performance on speech and visual recognition tasks. While their expressiveness is the reason they succeed, it also causes them to learn uninterpretable solutions that could have counter-intuitive properties. In this paper we report two such properties. First, we find that there is no distinction between individual high level units and random linear combinations of high level units, according to various methods of unit analysis. It suggests that it is the space, rather than the individual units, that contains the semantic information in the high layers of neural networks. Second, we find that deep neural networks learn input-output mappings that are fairly discontinuous to a significant extent. We can cause the network to misclassify an image by applying a certain hardly perceptible perturbation, which is found by maximizing the network’s prediction error. In addition, the specific nature of these perturbations is not a random artifact of learning: the same perturbation can cause a different network, that was trained on a different subset of the dataset, to misclassify the same input.}
}

@article{Szubert2019,
  author = {Szubert, Benjamin and Cole, Jennifer E. and Monaco, Claudia and Drozdov, Ignat},
  title = {Structure-Preserving Visualisation of High Dimensional Single-Cell Datasets},
  journaltitle = {Scientific Reports},
  date = {2019},
  volume = {9},
  number = {1},
  pages = {1--10},
  doi = {10.1038/s41598-019-45301-0},
  issn = {20452322},
  abstract = {Single-cell technologies offer an unprecedented opportunity to effectively characterize cellular heterogeneity in health and disease. Nevertheless, visualisation and interpretation of these multi-dimensional datasets remains a challenge. We present a novel framework, ivis, for dimensionality reduction of single-cell expression data. ivis utilizes a siamese neural network architecture that is trained using a novel triplet loss function. Results on simulated and real datasets demonstrate that ivis preserves global data structures in a low-dimensional space, adds new data points to existing embeddings using a parametric mapping function, and scales linearly to hundreds of thousands of cells. ivis is made publicly available through Python and R interfaces on https://github.com/beringresearch/ivis.},
  file = {/Users/ryedida/Zotero/storage/ESI7YM9R/Szubert et al. - 2019 - Structure-preserving visualisation of high dimensional single-cell datasets(2).pdf}
}

@article{tambwekarEstimationApplicationsQuantiles2021,
  title = {Estimation and {{Applications}} of {{Quantiles}} in {{Deep Binary Classification}}},
  author = {Tambwekar, Anuj and Maiya, Anirudh and Dhavala, Soma S and Saha, Snehanshu},
  date = {2021},
  journaltitle = {IEEE Transactions on Artificial Intelligence},
  pages = {1--1},
  issn = {2691-4581},
  doi = {10.1109/TAI.2021.3115078},
  abstract = {Conditional quantiles obtained via regression are used as a robust alternative to classical conditional means in Econometrics and Statistics, as they can capture the uncertainty in a prediction, and model tail behaviours, while making very few distributional assumptions. In this work, we extend the notion of conditional quantiles to the binary classification setting - allowing us to quantify the uncertainty in the predictions, increase resilience to label noise, and provide new insights into the functions learnt by the models. We accomplish this by defining a new loss called Binary Quantile Regression Loss. We compute the Lipschitz constant of the proposed loss and show that its curvature is bounded under some regularity conditions. These properties are later used to characterize the error rates of the learning algorithms and to accelerate the training regime with using Lipschitz Adaptive Learning Rates. We leverage the estimated quantiles to obtain individualized confidence scores that provide an accurate measure of a prediction being misclassified. We aggregate these scores to provide two additional metrics, namely, confidence score and retention rate, which can be used to withhold decisions and increase model accuracy. We also study the robustness of the proposed non-parametric binary quantile classification framework, and finally, we demonstrate that quantiles aid in explainability as they can be used to obtain several univariate summary statistics that can be directly applied to existing explanation tools.},
  keywords = {Artificial intelligence,Backpropagation,Computational modeling,Deep learning,Interpretability,Predictive models,Predictive Models,Robust Predictions,Robustness,Task analysis,Uncertainty,Uncertainty Quantification},
  annotation = {0 citations (Semantic Scholar/DOI) [2021-12-04]},
  file = {/Users/ryedida/Zotero/storage/VZAD3BI9/Tambwekar et al. - 2021 - Estimation and Applications of Quantiles in Deep B.pdf;/Users/ryedida/Zotero/storage/8IA65Q78/9548806.html}
}

@unpublished{Tanay2016,
  author = {Tanay, Thomas and Griffin, Lewis},
  title = {A {{Boundary Tilting Persepective}} on the {{Phenomenon}} of {{Adversarial Examples}}},
  date = {2016},
  pages = {1--20},
  eprinttype = {arxiv},
  eprint = {1608.07690},
  url = {http://arxiv.org/abs/1608.07690},
  abstract = {Deep neural networks have been shown to suffer from a surprising weakness: their classification outputs can be changed by small, non-random perturbations of their inputs. This adversarial example phenomenon has been explained as originating from deep networks being "too linear" (Goodfellow et al., 2014). We show here that the linear explanation of adversarial examples presents a number of limitations: the formal argument is not convincing, linear classifiers do not always suffer from the phenomenon, and when they do their adversarial examples are different from the ones affecting deep networks. We propose a new perspective on the phenomenon. We argue that adversarial examples exist when the classification boundary lies close to the submanifold of sampled data, and present a mathematical analysis of this new perspective in the linear case. We define the notion of adversarial strength and show that it can be reduced to the deviation angle between the classifier considered and the nearest centroid classifier. Then, we show that the adversarial strength can be made arbitrarily high independently of the classification performance due to a mechanism that we call boundary tilting. This result leads us to defining a new taxonomy of adversarial examples. Finally, we show that the adversarial strength observed in practice is directly dependent on the level of regularisation used and the strongest adversarial examples, symptomatic of overfitting, can be avoided by using a proper level of regularisation.},
  file = {/Users/ryedida/Zotero/storage/6JDDDMWF/Tanay, Griffin - 2016 - A Boundary Tilting Persepective on the Phenomenon of Adversarial Examples(2).pdf}
}

@inproceedings{tang1999empirical,
  author = {Tang, Mei-Huei and Kao, Ming-Hung and Chen, Mei-Hwa},
  title = {An Empirical Study on Object-Oriented Metrics},
  booktitle = {Proceedings Sixth International Software Metrics Symposium ({{Cat}}. {{No}}. {{PR00403}})},
  publisher = {{IEEE}},
  date = {1999},
  pages = {242--249}
}

@inproceedings{Tang2018,
  title = {Personalized {{Top-N Sequential Recommendation}} via {{Convolutional Sequence Embedding}}},
  author = {Tang, Jiaxi and Wang, Ke},
  date = {2018},
  booktitle = {Proceedings of the Eleventh {{ACM}} International Conference on Web Search and Data Mining},
  series = {{{WSDM}} '18},
  pages = {565--573},
  doi = {10.1145/3159652.3159656},
  url = {http://dl.acm.org/citation.cfm?doid=3159652.3159656},
  abstract = {Top-N sequential recommendation models each user as a sequence of items interacted in the past and aims to predict top-N ranked items that a user will likely interact in a "near future". The order of interaction implies that sequential patterns play an important role where more recent items in a sequence have a larger impact on the next item. In this paper, we propose a Convolutional Sequence Embedding Recommendation Model (Caser) as a solution to address this requirement. The idea is to embed a sequence of recent items into an "image" in the time and latent spaces and learn sequential patterns as local features of the image using convolutional filters. This approach provides a unified and flexible network structure for capturing both general preferences and sequential patterns. The experiments on public data sets demonstrated that Caser consistently outperforms state-of-the-art sequential recommendation methods on a variety of common evaluation metrics.},
  isbn = {9781450355810},
  keywords = {-  Information systems  -{$>$}  Retrieval models and r,convolutional neural,recommender system,sequential prediction}
}

@inproceedings{Tantithamthavorn16,
  author = {Tantithamthavorn, Chakkrit and McIntosh, Shane and Hassan, Ahmed E. and Matsumoto, Kenichi},
  title = {Automated Parameter Optimization of Classification Techniques for Defect Prediction Models},
  booktitle = {2016 {{IEEE}}/{{ACM}} 38th International Conference on Software Engineering ({{ICSE}})},
  date = {2016},
  doi = {10.1145/2884781.2884857},
  pages = {321--332}
}

@inproceedings{thornton2013auto,
  author = {Thornton, Chris and Hutter, Frank and Hoos, Holger H and Leyton-Brown, Kevin},
  title = {Auto-{{WEKA}}: {{Combined}} Selection and Hyperparameter Optimization of Classification Algorithms},
  booktitle = {Proceedings of the 19th {{ACM SIGKDD}} International Conference on {{Knowledge}} Discovery and Data Mining},
  pages = {847--855},
  date = {2013}
}

@inproceedings{Tjeng2019,
  title = {Evaluating Robustness of Neural Networks with Mixed Integer Programming},
  author = {Tjeng, Vincent and Xiao, Kai and Tedrake, Russ},
  date = {2019},
  booktitle = {7th International Conference on Learning Representations, {{ICLR}} 2019},
  eprint = {1711.07356},
  eprinttype = {arxiv},
  pages = {1--21},
  abstract = {Neural networks trained only to optimize for training accuracy can often be fooled by adversarial examples - slightly perturbed inputs misclassified with high confidence. Verification of networks enables us to gauge their vulnerability to such adversarial examples. We formulate verification of piecewise-linear neural networks as a mixed integer program. On a representative task of finding minimum adversarial distortions, our verifier is two to three orders of magnitude quicker than the state-of-the-art. We achieve this computational speedup via tight formulations for non-linearities, as well as a novel presolve algorithm that makes full use of all information available. The computational speedup allows us to verify properties on convolutional and residual networks with over 100,000 ReLUs - several orders of magnitude more than networks previously verified by any complete verifier. In particular, we determine for the first time the exact adversarial accuracy of an MNIST classifier to perturbations with bounded l∞ norm = 0.1: for this classifier, we find an adversarial example for 4.38\% of samples, and a certificate of robustness to norm-bounded perturbations for the remainder. Across all robust training procedures and network architectures considered, and for both the MNIST and CIFAR-10 datasets, we are able to certify more samples than the state-of-the-art and find more adversarial examples than a strong first-order attack.},
  file = {/Users/ryedida/Zotero/storage/CMKZ2QYS/Tjeng, Xiao, Tedrake - 2019 - Evaluating robustness of neural networks with mixed integer programming(2).pdf}
}

@unpublished{tolstikhinMLPMixerAllMLPArchitecture2021,
  author = {Tolstikhin, Ilya and Houlsby, Neil and Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Unterthiner, Thomas and Yung, Jessica and Keysers, Daniel and Uszkoreit, Jakob and Lucic, Mario and Dosovitskiy, Alexey},
  title = {{{MLP-Mixer}}: {{An}} All-{{MLP Architecture}} for {{Vision}}},
  shorttitle = {{{MLP-Mixer}}},
  date = {2021-05-04},
  eprinttype = {arxiv},
  eprintclass = {cs},
  eprint = {2105.01601},
  url = {http://arxiv.org/abs/2105.01601},
  urldate = {2021-05-08},
  langid = {english},
  abstract = {Convolutional Neural Networks (CNNs) are the go-to model for computer vision. Recently, attention-based networks, such as the Vision Transformer, have also become popular. In this paper we show that while convolutions and attention are both sufficient for good performance, neither of them are necessary. We present MLP-Mixer, an architecture based exclusively on multi-layer perceptrons (MLPs). MLP-Mixer contains two types of layers: one with MLPs applied independently to image patches (i.e. "mixing" the per-location features), and one with MLPs applied across patches (i.e. "mixing" spatial information). When trained on large datasets, or with modern regularization schemes, MLP-Mixer attains competitive scores on image classification benchmarks, with pre-training and inference cost comparable to state-of-the-art models. We hope that these results spark further research beyond the realms of well established CNNs and Transformers.},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
  annotation = {0 citations (Semantic Scholar/arXiv) [2021-05-08]},
  file = {/Users/ryedida/Zotero/storage/HLQMDRN6/Tolstikhin et al. - 2021 - MLP-Mixer An all-MLP Architecture for Vision.pdf}
}

@inproceedings{tomassi2021real,
  author = {Tomassi, David A and Rubio-González, Cindy},
  title = {On the Real-World Effectiveness of Static Bug Detectors at Finding Null Pointer Exceptions},
  booktitle = {2021 36th {{IEEE}}/{{ACM}} International Conference on Automated Software Engineering ({{ASE}})},
  publisher = {{IEEE}},
  date = {2021},
  pages = {292--303}
}

@inproceedings{tramerAdaptiveAttacksAdversarial,
  title = {On {{Adaptive Attacks}} to {{Adversarial Example Defenses}}},
  author = {Tramèr, Florian and Carlini, Nicholas and Brendel, Wieland and Madry, Aleksander},
  date = {2020},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  volume = {33},
  eprint = {2002.08347},
  eprinttype = {arxiv},
  pagetotal = {13},
  abstract = {Adaptive attacks have (rightfully) become the de facto standard for evaluating defenses to adversarial examples. We find, however, that typical adaptive evaluations are incomplete. We demonstrate that thirteen defenses recently published at ICLR, ICML and NeurIPS—and which illustrate a diverse set of defense strategies—can be circumvented despite attempting to perform evaluations using adaptive attacks. While prior evaluation papers focused mainly on the end result—showing that a defense was ineffective—this paper focuses on laying out the methodology and the approach necessary to perform an adaptive attack. Some of our attack strategies are generalizable, but no single strategy would have been sufficient for all defenses. This underlines our key message that adaptive attacks cannot be automated and always require careful and appropriate tuning to a given defense. We hope that these analyses will serve as guidance on how to properly perform adaptive attacks against defenses to adversarial examples, and thus will allow the community to make further progress in building more robust models.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/I7SDI8EB/Tramèr et al. - On Adaptive Attacks to Adversarial Example Defense.pdf}
}

@inproceedings{Tsipras2019,
  title = {Robustness May Be at Odds with Accuracy},
  author = {Tsipras, Dimitris and Santurkar, Shibani and Engstrom, Logan and Turner, Alexander and Madry, Aleksander},
  date = {2019},
  booktitle = {7th International Conference on Learning Representations, {{ICLR}} 2019},
  eprint = {1805.12152},
  eprinttype = {arxiv},
  pages = {1--24},
  abstract = {We show that there exists an inherent tension between the goal of adversarial robustness and that of standard generalization. Specifically, training robust models may not only be more resource-consuming, but also lead to a reduction of standard accuracy. We demonstrate that this trade-off between the standard accuracy of a model and its robustness to adversarial perturbations provably exists even in a fairly simple and natural setting. These findings also corroborate a similar phenomenon observed in practice. Further, we argue that this phenomenon is a consequence of robust classifiers learning fundamentally different feature representations than standard classifiers. These differences, in particular, seem to result in unexpected benefits: the features learned by robust models tend to align better with salient data characteristics and human perception.},
  file = {/Users/ryedida/Zotero/storage/33FLPLHP/Tsipras et al. - 2019 - Robustness may be at odds with accuracy(2).pdf}
}

@article{tsitsiklisAnalysisTemporaldifferenceLearning1997,
  title = {An Analysis of Temporal-Difference Learning with Function Approximation},
  author = {Tsitsiklis, John N. and Van Roy, Benjamin},
  date = {1997-05},
  journaltitle = {IEEE Transactions on Automatic Control},
  shortjournal = {IEEE Trans. Automat. Contr.},
  volume = {42},
  number = {5},
  pages = {674--690},
  issn = {0018-9286},
  doi = {10.1109/9.580874},
  url = {http://ieeexplore.ieee.org/document/580874/},
  urldate = {2023-12-07},
  abstract = {We discuss the temporal-difference learning algorithm, as applied to approximating the cost-to-go function of an infinite-horizon discounted Markov chain. The algorithm we analyze updates parameters of a linear function approximator online during a single endless trajectory of an irreducible aperiodic Markov chain with a finite or infinite state space. We present a proof of convergence (with probability one), a characterization of the limit of convergence, and a bound on the resulting approximation error. Furthermore, our analysis is based on a new line of reasoning that provides new intuition about the dynamics of temporal-difference learning.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/5GW5PV47/Tsitsiklis and Van Roy - 1997 - An analysis of temporal-difference learning with f.pdf}
}

@article{Tung2018,
  title = {Deep {{Neural Network Compression}} by {{In-Parallel Pruning-Quantization}}},
  author = {Tung, Frederick and Mori, Greg},
  date = {2018},
  journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume = {PP},
  pages = {1--1},
  publisher = {{IEEE}},
  issn = {0162-8828},
  doi = {10.1109/TPAMI.2018.2886192},
  url = {https://ieeexplore.ieee.org/document/8573867/},
  file = {/Users/ryedida/Zotero/storage/TGVK2GRK/Tung, Mori - 2018 - Deep Neural Network Compression by In-Parallel Pruning-Quantization(2).pdf}
}

@inproceedings{turner2021bayesian,
  author = {Turner, Ryan and Eriksson, David and McCourt, Michael and Kiili, Juha and Laaksonen, Eero and Xu, Zhen and Guyon, Isabelle},
  title = {Bayesian Optimization Is Superior to Random Search for Machine Learning Hyperparameter Tuning: {{Analysis}} of the Black-Box Optimization Challenge 2020},
  booktitle = {{{NeurIPS}} 2020 Competition and Demonstration Track},
  publisher = {{PMLR}},
  date = {2021},
  pages = {3--26}
}

@inproceedings{Uesato2019,
  title = {Rigorous Agent Evaluation: {{An}} Adversarial Approach to Uncover Catastrophic Failures},
  author = {Uesato, Jonathan and Kumar, Ananya and Szepesvari, Csaba and Erez, Tom and Ruderman, Avraham and Anderson, Keith and Dvijotham, Krishmamurthy and Heess, Nicolas and Kohli, Pushmeet},
  date = {2019},
  booktitle = {7th International Conference on Learning Representations, {{ICLR}} 2019},
  eprint = {1812.01647},
  eprinttype = {arxiv},
  abstract = {This paper addresses the problem of evaluating learning systems in safety critical domains such as autonomous driving, where failures can have catastrophic consequences. We focus on two problems: searching for scenarios when learned agents fail and assessing their probability of failure. The standard method for agent evaluation in reinforcement learning, Vanilla Monte Carlo, can miss failures entirely, leading to the deployment of unsafe agents. We demonstrate this is an issue for current agents, where even matching the compute used for training is sometimes insufficient for evaluation. To address this shortcoming, we draw upon the rare event probability estimation literature and propose an adversarial evaluation approach. Our approach focuses evaluation on adversarially chosen situations, while still providing unbiased estimates of failure probabilities. The key difficulty is in identifying these adversarial situations - since failures are rare there is little signal to drive optimization. To solve this we propose a continuation approach that learns failure modes in related but less robust agents. Our approach also allows reuse of data already collected for training the agent. We demonstrate the efficacy of adversarial evaluation on two standard domains: humanoid control and simulated driving. Experimental results show that our methods can find catastrophic failures and estimate failures rates of agents multiple orders of magnitude faster than standard evaluation schemes, in minutes to hours rather than days.},
  file = {/Users/ryedida/Zotero/storage/USNFL2YJ/Uesato et al. - 2019 - Rigorous agent evaluation An adversarial approach to uncover catastrophic failures(2).pdf}
}

@inproceedings{uesatoAdversarialRiskDangers,
  title = {Adversarial {{Risk}} and the {{Dangers}} of {{Evaluating Against Weak Attacks}}},
  author = {Uesato, Jonathan and O’Donoghue, Brendan and van den Oord, Aaron and Kohli, Pushmeet},
  date = {2018},
  booktitle = {Proceedings of the 35th International Conference on Machine Learning},
  publisher = {{PMLR}},
  eprint = {1802.05666},
  eprinttype = {arxiv},
  abstract = {This paper investigates recently proposed approaches for defending against adversarial examples and evaluating adversarial robustness. We motivate adversarial risk as an objective for achieving models robust to worst-case inputs. We then frame commonly used attacks and evaluation metrics as defining a tractable surrogate objective to the true adversarial risk. This suggests that models may optimize this surrogate rather than the true adversarial risk. We formalize this notion as obscurity to an adversary, and develop tools and heuristics for identifying obscured models and designing transparent models. We demonstrate that this is a significant problem in practice by repurposing gradient-free optimization techniques into adversarial attacks, which we use to decrease the accuracy of several recently proposed defenses to near zero. Our hope is that our formulations and results will help researchers to develop more powerful defenses.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/UB5VS3Q9/Uesato and O’Donoghue - Adversarial Risk and the Dangers of Evaluating Aga.pdf}
}

@unpublished{Ulyanov2016,
  title = {Instance {{Normalization}}: {{The Missing Ingredient}} for {{Fast Stylization}}},
  author = {Ulyanov, Dmitry and Vedaldi, Andrea and Lempitsky, Victor},
  date = {2016},
  eprint = {1607.08022},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1607.08022},
  abstract = {It this paper we revisit the fast stylization method introduced in Ulyanov et. al. (2016). We show how a small change in the stylization architecture results in a significant qualitative improvement in the generated images. The change is limited to swapping batch normalization with instance normalization, and to apply the latter both at training and testing times. The resulting method can be used to train high-performance architectures for real-time image generation. The code will is made available on github at https://github.com/DmitryUlyanov/texture\_nets. Full paper can be found at arXiv:1701.02096.},
  file = {/Users/ryedida/Zotero/storage/NILMQJEU/Ulyanov, Vedaldi, Lempitsky - 2016 - Instance Normalization The Missing Ingredient for Fast Stylization(2).pdf}
}

@online{urbanDeepConvolutionalNets2017,
  author = {Urban, Gregor and Geras, Krzysztof J. and Kahou, Samira Ebrahimi and Aslan, Ozlem and Wang, Shengjie and Caruana, Rich and Mohamed, Abdelrahman and Philipose, Matthai and Richardson, Matt},
  title = {Do {{Deep Convolutional Nets Really Need}} to Be {{Deep}} and {{Convolutional}}?},
  date = {2017-03-03},
  pubstate = {preprint},
  eprinttype = {arxiv},
  eprintclass = {cs, stat},
  eprint = {1603.05691},
  url = {http://arxiv.org/abs/1603.05691},
  urldate = {2024-01-21},
  langid = {english},
  abstract = {Yes, they do. This paper provides the first empirical demonstration that deep convolutional models really need to be both deep and convolutional, even when trained with methods such as distillation that allow small or shallow models of high accuracy to be trained. Although previous research showed that shallow feed-forward nets sometimes can learn the complex functions previously learned by deep nets while using the same number of parameters as the deep models they mimic, in this paper we demonstrate that the same methods cannot be used to train accurate models on CIFAR-10 unless the student models contain multiple layers of convolution. Although the student models do not have to be as deep as the teacher model they mimic, the students need multiple convolutional layers to learn functions of comparable accuracy as the deep convolutional teacher.},
  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
  file = {/Users/ryedida/Zotero/storage/IAF8A56M/Urban et al. - 2017 - Do Deep Convolutional Nets Really Need to be Deep .pdf}
}

@article{VanDerMaaten2008a,
  title = {Visualizing Data Using {{t-SNE}}},
  author = {van der Maaten, Laurens and Hinton, Geoffrey},
  date = {2008},
  journaltitle = {Journal of Machine Learning Research},
  volume = {9},
  pages = {2579--2605},
  issn = {1532-4435},
  abstract = {We present a new technique called "t-SNE" that visualizes high-dimensional data by giving each datapoint a location in a two or three-dimensional map. The technique is a variation of Stochastic Neighbor Embedding (Hinton and Roweis, 2002) that is much easier to optimize, and produces significantly better visualizations by reducing the tendency to crowd points together in the center of the map. t-SNE is better than existing techniques at creating a single map that reveals structure at many different scales. This is particularly important for high-dimensional data that lie on several different, but related, low-dimensional manifolds, such as images of objects from multiple classes seen from multiple viewpoints. For visualizing the structure of very large data sets, we show how t-SNE can use random walks on neighborhood graphs to allow the implicit structure of all of the data to influence the way in which a subset of the data is displayed. We illustrate the performance of t-SNE on a wide variety of data sets and compare it with many other non-parametric visualization techniques, including Sammon mapping, Isomap, and Locally Linear Embedding. The visualizations produced by t-SNE are significantly better than those produced by the other techniques on almost all of the data sets.},
  keywords = {Dimensionality reduction,Embedding algorithms,Manifold learning,Multidimensional scaling,Visualization},
  file = {/Users/ryedida/Zotero/storage/I6HSNW3D/Van Der Maaten, Hinton - 2008 - Visualizing data using t-SNE(2).pdf}
}

@book{vandervaartWeakConvergenceEmpirical2023,
  title = {Weak {{Convergence}} and {{Empirical Processes}}: {{With Applications}} to {{Statistics}}},
  shorttitle = {Weak {{Convergence}} and {{Empirical Processes}}},
  author = {van der Vaart, A. W. and Wellner, Jon A.},
  date = {2023},
  edition = {2},
  series = {Springer {{Series}} in {{Statistics}}},
  publisher = {{Springer International Publishing}},
  location = {{Cham}},
  doi = {10.1007/978-3-031-29040-4},
  url = {https://link.springer.com/10.1007/978-3-031-29040-4},
  urldate = {2023-11-20},
  isbn = {978-3-031-29038-1 978-3-031-29040-4},
  langid = {english},
  keywords = {Bracketing Entropy,Chaining,Concentration of Measure,Donsker Theorems,Empirical Processes,Glivenko-Cantelli Theorems,Majorizing Measures,Rates of Convergence,Skorokhod Space,Weak Convergence},
  file = {/Users/ryedida/Zotero/storage/E5AM6UY9/Van Der Vaart_Wellner_2023_Weak Convergence and Empirical Processes.pdf}
}

@online{vaswaniAttentionAllYou,
  title = {Attention {{Is All You Need}}},
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Łukasz and Polosukhin, Illia},
  date = {2017},
  eprint = {1706.03762v5},
  eprinttype = {arxiv},
  abstract = {The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.}
}

@unpublished{vermaManifoldMixupBetter2019,
  author      = {Verma, Vikas and Lamb, Alex and Beckham, Christopher and Najafi, Amir and Mitliagkas, Ioannis and Courville, Aaron and Lopez-Paz, David and Bengio, Yoshua},
  title       = {Manifold {{Mixup}}: {{Better Representations}} by {{Interpolating Hidden States}}},
  shorttitle  = {Manifold {{Mixup}}},
  date        = {2019-05-11},
  eprinttype  = {arxiv},
  eprint      = {1806.05236},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1806.05236},
  urldate     = {2021-04-02},
  langid      = {english},
  keywords    = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing,Statistics - Machine Learning},
  annotation  = {20 citations (Semantic Scholar/arXiv) [2021-04-02]},
  abstract    = {Deep networks often perform well on the data manifold on which they are trained, yet give incorrect (and often very confident) answers when evaluated on points from off of the training distribution. This is exemplified by the adversarial examples phenomenon but can also be seen in terms of model generalization and domain shift. We propose Manifold Mixup which encourages the network to produce more reasonable and less confident predictions at points with combinations of attributes not seen in the training set. This is accomplished by training on convex combinations of the hidden state representations of data samples. Using this method, we demonstrate improved semi-supervised learning, learning with limited labeled data, and robustness to adversarial examples. Manifold Mixup requires no (significant) additional computation. Analytical experiments on both real data and synthetic data directly support our hypothesis for why the Manifold Mixup method improves results.},
  file        = {/Users/ryedida/Zotero/storage/AI3V7FGM/Verma et al. - 2019 - Manifold Mixup Better Representations by Interpol.pdf}
}

@article{wan2018perceptions,
  title = {Perceptions, Expectations, and Challenges in Defect Prediction},
  author = {Wan, Zhiyuan and Xia, Xin and Hassan, Ahmed E. and Lo, David and Yin, Jianwei and Yang, Xiaohu},
  date = {2020},
  journaltitle = {IEEE Transactions on Software Engineering},
  volume = {46},
  number = {11},
  pages = {1241--1266},
  publisher = {{IEEE}}
}

@unpublished{Wang2017,
  title = {{{Machine Learning}} for {{Survival Analysis}}: {{A Survey}}},
  author = {Wang, Ping and Li, Yan and Reddy, Chandan K.},
  date = {2017},
  eprint = {1708.04649v1},
  eprinttype = {arxiv},
  pages = {1--39},
  file = {/Users/ryedida/Zotero/storage/E2D3H6HT/Wang, Li, Reddy - 2017 - 1 Machine Learning for Survival Analysis A Survey(2).pdf}
}

@inproceedings{Wang2018,
  title = {Formal Security Analysis of Neural Networks Using Symbolic Intervals},
  booktitle = {Proceedings of the 27th {{USENIX Security Symposium}}},
  author = {Wang, Shiqi and Pei, Kexin and Whitehouse, Justin and Yang, Junfeng and Jana, Suman},
  date = {2018},
  eprint = {1804.10829},
  eprinttype = {arxiv},
  pages = {1599--1614},
  abstract = {Due to the increasing deployment of Deep Neural Networks (DNNs) in real-world security-critical domains including autonomous vehicles and collision avoidance systems, formally checking security properties of DNNs, especially under different attacker capabilities, is becoming crucial. Most existing security testing techniques for DNNs try to find adversarial examples without providing any formal security guarantees about the non-existence of such adversarial examples. Recently, several projects have used different types of Satisfiability Modulo Theory (SMT) solvers to formally check security properties of DNNs. However, all of these approaches are limited by the high overhead caused by the solver. In this paper, we present a new direction for formally checking security properties of DNNs without using SMT solvers. Instead, we leverage interval arithmetic to compute rigorous bounds on the DNN outputs. Our approach, unlike existing solver-based approaches, is easily parallelizable. We further present symbolic interval analysis along with several other optimizations to minimize over-estimations of output bounds. We design, implement, and evaluate our approach as part of ReluVal, a system for formally checking security properties of Relu-based DNNs. Our extensive empirical results show that ReluVal outperforms Reluplex, a state-of-the-art solver-based system, by 200 times on average. On a single 8-core machine without GPUs, within 4 hours, ReluVal is able to verify a security property that Reluplex deemed inconclusive due to timeout after running for more than 5 days. Our experiments demonstrate that symbolic interval analysis is a promising new direction towards rigorously analyzing different security properties of DNNs.},
  isbn = {9781939133045},
  file = {/Users/ryedida/Zotero/storage/TK93WRAT/Wang et al. - 2018 - Formal security analysis of neural networks using symbolic intervals(2).pdf}
}

@inproceedings{wang2018there,
  title = {Is There a ``Golden'' Feature Set for Static Warning Identification? An Experimental Evaluation},
  booktitle = {Proceedings of the 12th {{ACM}}/{{IEEE}} International Symposium on Empirical Software Engineering and Measurement},
  author = {Wang, Junjie and Wang, Song and Wang, Qing},
  date = {2018},
  pages = {1--10}
}

@inproceedings{wangAutomaticallyLearningSemantic2016,
  title = {Automatically Learning Semantic Features for Defect Prediction},
  booktitle = {Proceedings of the 38th {{International Conference}} on {{Software Engineering}}},
  author = {Wang, Song and Liu, Taiyue and Tan, Lin},
  date = {2016},
  pages = {297--308},
  publisher = {{ACM}},
  issn = {0270-5257},
  doi = {10.1145/2884781.2884804},
  abstract = {Software defect prediction, which predicts defective code regions, can help developers find bugs and prioritize their testing efforts. To build accurate prediction models, previous studies focus on manually designing features that encode the characteristics of programs and exploring different machine learning algorithms. Existing traditional features often fail to capture the semantic differences of programs, and such a capability is needed for building accurate prediction models. To bridge the gap between programs' semantics and defect prediction features, this paper proposes to leverage a powerful representation-learning algorithm, deep learning, to learn semantic representation of programs automatically from source code. Specifically, we leverage Deep Belief Network (DBN) to automatically learn semantic features from token vectors extracted from programs' Abstract Syntax Trees (ASTs). Our evaluation on ten open source projects shows that our automatically learned semantic features significantly improve both within-project defect prediction (WPDP) and cross-project defect prediction (CPDP) compared to traditional features. Our semantic features improve WPDP on average by 14.7\% in precision, 11.5\% in recall, and 14.2\% in F1. For CPDP, our semantic features based approach outperforms the state-of-the-art technique TCA+ with traditional features by 8.9\% in F1.},
  isbn = {9781450339001},
  keywords = {Abstract syntax trees,Art,artifical intelligence,Artificial intelligence,Attribute selection,Bayesian methods,benchmarking.,bonferroni adjustment,classification,Clone detection,Code clone detection,code similarities,communication channels,confidence interval,data analytics for software,data mining,Data mining,Data mining detect prediction,deep learning,Deep learning,Defect prediction,defect predictor learning,differential evolution,effect size,empirical,Empirical,Empirical software engineering,engineering,Feature extraction,Feature location,Financial management,Halstead,Information need,Information retrieval,Language models,learning (artificial intelligence),Learning systems,lines of code counts,Machine learning,McCabe,McCabes versus Halstead,Mining software engineering repositories,naive Bayes.,Neu-ral networks,neural networks,NLP,non-parametric test,parameter,parametric test,Predictive models,program diagnostics,program testing,Query quality,resource-bound exploration,Risky Software Commits,search based software engineering,Security,Semantics,sentiment analysis,social software engineering,software analytics,Software Analytics,Software Metrics,Software Prediction,software quality,Software quality,Software systems,Software testing,Software vulnerability prediction,static code attributes,Static code features,statistical difference,survey,svm,System recovery,System testing,systematic review,Test collection challenge,tuning,WHICH},
  file = {/Users/ryedida/Zotero/storage/8JSVMPEP/Wang, Liu, Tan - 2016 - Automatically learning semantic features for defect prediction(2).pdf}
}

@article{wangNoiseModulationLet,
  title = {Noise {{Modulation}}: {{Let Your Model Interpret Itself}}},
  author = {Li, Haoyang and Wang, Xinggang},
  pagetotal = {10},
  abstract = {Given the great success of Deep Neural Networks(DNNs) and the black-box nature of it, the interpretability of these models becomes an important issue. The majority of previous research works on the post-hoc interpretation of a trained model. But recently, adversarial training shows that it is possible for a model to have an interpretable input-gradient through training. However, adversarial training lacks efficiency for interpretability. To resolve this problem, we construct an approximation of the adversarial perturbations and discover a connection between adversarial training and amplitude modulation. Based on a digital analogy, we propose noise modulation as an efficient and model-agnostic alternative to train a model that interprets itself with input-gradients. Experiment results show that noise modulation can effectively increase the interpretability of input-gradients model-agnosticly.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/6NPVKXA7/Wang - Noise Modulation Let Your Model Interpret Itself.pdf}
}

@article{wattenbergHowUseTSNE2016,
  title = {How to Use {{t-SNE}} Effectively},
  author = {Wattenberg, Martin and Viégas, Fernanda and Johnson, Ian},
  date = {2016},
  journaltitle = {Distill},
  volume = {1},
  number = {10},
  pages = {e2},
  issn = {2476-0757},
  doi = {10.23915/distill.00002}
}

@article{Wei2016,
  title = {Exploring Characteristics of Suspended Users and Network Stability on {{Twitter}}},
  author = {Wei, Wei and Joseph, Kenneth and Liu, Huan and Carley, Kathleen M.},
  date = {2016},
  journaltitle = {Social Network Analysis and Mining},
  volume = {6},
  number = {1},
  publisher = {{Springer Vienna}},
  issn = {1869-5469},
  doi = {10.1007/s13278-016-0358-5},
  abstract = {Social media is rapidly becoming a medium of choice for understanding the cultural pulse of a region; e.g. for identifying what the population is concerned with and what kind of help is needed in a crisis. To assess this cultural pulse, it is critical to have an accurate assessment of who is saying what. Unfortunately, social media is also the home of users who engage in disruptive, disingenuous, and potentially illegal activity. A range of users, both human and non-human, carry out such social cyber-attacks. We ask, to what extent does the presence or absence of such users influence our ability to assess the cultural pulse of a region? Our prior research on this topic showed that Twitter-based network structures and content are unstable and can be highly impacted by the removal of suspended users. Because of this, statistical techniques can be established to differentiate potential types of suspended and non-suspended users. In this extended paper, we develop additional experiments to explore the spatial patterns of suspended users, and we further consider how these users affect structural and content concentrations via the development of new metrics and new analyses. We find significant evidence that suspended users exist on the periphery of social networks on Twitter and consequently that removing them has little impact on network structure. We also improve prior attempts to distinguish among different types of suspended users by using a much larger dataset. Finally, we conduct a temporal sentiment analysis to illustrate differences between suspended users and non-suspended users on this dimension.},
  file = {/Users/ryedida/Zotero/storage/KY2Q6FG9/Wei et al. - 2016 - Exploring characteristics of suspended users and network stability on Twitter(2).pdf}
}

@inproceedings{weiChainofThoughtPromptingElicits,
  title = {Chain-of-{{Thought Prompting Elicits Reasoning}} in {{Large Language Models}}},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  author = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Ichter, Brian and Xia, Fei and Chi, Ed H. and Le, Quoc V. and Zhou, Denny},
  date = {2022},
  volume = {35},
  eprint = {2201.11903},
  eprinttype = {arxiv},
  abstract = {We explore how generating a chain of thought—a series of intermediate reasoning steps—significantly improves the ability of large language models to perform complex reasoning. In particular, we show how such reasoning abilities emerge naturally in sufficiently large language models via a simple method called chain-of-thought prompting, where a few chain of thought demonstrations are provided as exemplars in prompting. Experiments on three large language models show that chain-of-thought prompting improves performance on a range of arithmetic, commonsense, and symbolic reasoning tasks. The empirical gains can be striking. For instance, prompting a PaLM 540B with just eight chain-of-thought exemplars achieves state-of-the-art accuracy on the GSM8K benchmark of math word problems, surpassing even finetuned GPT-3 with a verifier.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/LLAVV362/Wei et al. - Chain-of-Thought Prompting Elicits Reasoning in La.pdf}
}

@unpublished{weiFinetunedLanguageModels2021,
  title = {Finetuned {{Language Models Are Zero-Shot Learners}}},
  author = {Wei, Jason and Bosma, Maarten and Zhao, Vincent Y. and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M. and Le, Quoc V.},
  date = {2021-12-01},
  eprint = {2109.01652},
  eprinttype = {arxiv},
  eprintclass = {cs.CL},
  url = {http://arxiv.org/abs/2109.01652},
  urldate = {2022-01-07},
  abstract = {This paper explores a simple method for improving the zero-shot learning abilities of language models. We show that instruction tuning—finetuning language models on a collection of datasets described via instructions—substantially boosts zero-shot performance on unseen tasks. We take a 137B parameter pretrained language model and instruction tune it on over 60 NLP datasets verbalized via natural language instruction templates. We evaluate this instruction-tuned model, which we call FLAN, on unseen task types. FLAN substantially improves the performance of its unmodified counterpart and surpasses zero-shot 175B GPT-3 on 20 of 25 datasets that we evaluate. FLAN even outperforms few-shot GPT-3 by a large margin on ANLI, RTE, BoolQ, AI2-ARC, OpenbookQA, and StoryCloze. Ablation studies reveal that number of datasets and model scale are key components to the success of instruction tuning.},
  langid = {english},
  keywords = {Computer Science - Computation and Language},
  annotation = {27 citations (Semantic Scholar/arXiv) [2022-01-07]},
  file = {/Users/ryedida/Zotero/storage/W488Y7KT/Wei et al. - 2021 - Finetuned Language Models Are Zero-Shot Learners.pdf}
}

@unpublished{weiPrototypicalClassifierRobust2021,
  title = {Prototypical {{Classifier}} for {{Robust Class-Imbalanced Learning}}},
  author = {Wei, Tong and Shi, Jiang-Xin and Li, Yu-Feng and Zhang, Min-Ling},
  date = {2021-10-21},
  eprint = {2110.11553},
  eprinttype = {arxiv},
  eprintclass = {cs.CV},
  url = {http://arxiv.org/abs/2110.11553},
  urldate = {2021-11-28},
  abstract = {Deep neural networks have been shown to be very powerful methods for many supervised learning tasks. However, they can also easily overfit to training set biases, i.e., label noise and class imbalance. While both learning with noisy labels and class-imbalanced learning have received tremendous attention, existing works mainly focus on one of these two training set biases. To fill the gap, we propose Prototypical Classifier, which does not require fitting additional parameters given the embedding network. Unlike conventional classifiers that are biased towards head classes, Prototypical Classifier produces balanced and comparable predictions for all classes even though the training set is class-imbalanced. By leveraging this appealing property, we can easily detect noisy labels by thresholding the confidence scores predicted by Prototypical Classifier, where the threshold is dynamically adjusted through the iteration. A sample reweghting strategy is then applied to mitigate the influence of noisy labels. We test our method on CIFAR-10-LT, CIFAR-100-LT and Webvision datasets, observing that Prototypical Classifier obtains substaintial improvements compared with state of the arts.},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition},
  annotation = {0 citations (Semantic Scholar/arXiv) [2021-11-28]},
  file = {/Users/ryedida/Zotero/storage/ILKD2UY3/Wei et al. - 2021 - Prototypical Classifier for Robust Class-Imbalance.pdf}
}

@inproceedings{wellingBayesianLearningStochastic,
  title = {Bayesian {{Learning}} via {{Stochastic Gradient Langevin Dynamics}}},
  booktitle = {Proceedings of the 28th {{International Conference}} on {{Machine Learning}}},
  author = {Welling, Max and Teh, Yee Whye},
  date = {2011},
  pages = {681--688},
  abstract = {In this paper we propose a new framework for learning from large scale datasets based on iterative learning from small mini-batches. By adding the right amount of noise to a standard stochastic gradient optimization algorithm we show that the iterates will converge to samples from the true posterior distribution as we anneal the stepsize. This seamless transition between optimization and Bayesian posterior sampling provides an inbuilt protection against overfitting. We also propose a practical method for Monte Carlo estimates of posterior statistics which monitors a “sampling threshold” and collects samples after it has been surpassed. We apply the method to three models: a mixture of Gaussians, logistic regression and ICA with natural gradients.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/DSAF6MPE/Welling and Teh - Bayesian Learning via Stochastic Gradient Langevin.pdf}
}

@inproceedings{wellingBayesianLearningStochastica,
  title = {Bayesian {{Learning}} via {{Stochastic Gradient Langevin Dynamics}}},
  booktitle = {Proceedings of the 28th {{International Conference}} on {{Machine Learning}}},
  author = {Welling, Max and Teh, Yee Whye},
  date = {2011},
  pages = {681--688},
  abstract = {In this paper we propose a new framework for learning from large scale datasets based on iterative learning from small mini-batches. By adding the right amount of noise to a standard stochastic gradient optimization algorithm we show that the iterates will converge to samples from the true posterior distribution as we anneal the stepsize. This seamless transition between optimization and Bayesian posterior sampling provides an inbuilt protection against overfitting. We also propose a practical method for Monte Carlo estimates of posterior statistics which monitors a “sampling threshold” and collects samples after it has been surpassed. We apply the method to three models: a mixture of Gaussians, logistic regression and ICA with natural gradients.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/A5KKLK9E/Welling and Teh - Bayesian Learning via Stochastic Gradient Langevin.pdf}
}

@article{wen2018well,
  author       = {Wen, Ming and Wu, Rongxin and Cheung, Shing-Chi},
  title        = {How Well Do Change Sequences Predict Defects? Sequence Learning from Software Changes},
  journaltitle = {IEEE Transactions on Software Engineering},
  publisher    = {{IEEE}},
  date         = {2018}
}

@article{wenSharpnessMinimizationAlgorithms,
  author   = {Wen, Kaiyue and Li, Zhiyuan},
  title    = {Sharpness {{Minimization Algorithms Do Not Only Minimize Sharpness To Achieve Better Generalization}}},
  langid   = {english},
  abstract = {Despite extensive studies, the underlying reason as to why overparameterized neural networks can generalize remains elusive. Existing theory shows that common stochastic optimizers prefer flatter minimizers of the training loss, and thus a natural potential explanation is that flatness implies generalization. This work critically examines this explanation. Through theoretical and empirical investigation, we identify the following three scenarios for two-layer ReLU networks: (1) flatness provably implies generalization; (2) there exist non-generalizing flattest models and sharpness minimization algorithms fail to generalize poorly, and (3) perhaps most strikingly, there exist non-generalizing flattest models, but sharpness minimization algorithms still generalize. Our results suggest that the relationship between sharpness and generalization subtly depends on the data distributions and the model architectures and sharpness minimization algorithms do not only minimize sharpness to achieve better generalization. This calls for the search for other explanations for the generalization of over-parameterized neural networks.},
  file     = {/Users/ryedida/Zotero/storage/RT46LX4S/Wen and Li - Sharpness Minimization Algorithms Do Not Only Mini.pdf}
}

@inproceedings{white2015toward,
  author    = {White, Martin and Vendome, Christopher and Linares-Vásquez, Mario and Poshyvanyk, Denys},
  title     = {Toward Deep Learning Software Repositories},
  booktitle = {2015 {{IEEE}}/{{ACM}} 12th {{Working Conference}} on {{Mining Software Repositories}}},
  publisher = {{IEEE}},
  date      = {2015},
  pages     = {334--345}
}

@online{whiteNeuralArchitectureSearch2023,
  author      = {White, Colin and Safari, Mahmoud and Sukthanker, Rhea and Ru, Binxin and Elsken, Thomas and Zela, Arber and Dey, Debadeepta and Hutter, Frank},
  title       = {Neural {{Architecture Search}}: {{Insights}} from 1000 {{Papers}}},
  shorttitle  = {Neural {{Architecture Search}}},
  date        = {2023-01-25},
  pubstate    = {preprint},
  eprinttype  = {arxiv},
  eprint      = {2301.08727},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/2301.08727},
  urldate     = {2024-01-22},
  langid      = {english},
  keywords    = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract    = {In the past decade, advances in deep learning have resulted in breakthroughs in a variety of areas, including computer vision, natural language understanding, speech recognition, and reinforcement learning. Specialized, high-performing neural architectures are crucial to the success of deep learning in these areas. Neural architecture search (NAS), the process of automating the design of neural architectures for a given task, is an inevitable next step in automating machine learning and has already outpaced the best human-designed architectures on many tasks. In the past few years, research in NAS has been progressing rapidly, with over 1000 papers released since 2020 (Deng and Lindauer, 2021). In this survey, we provide an organized and comprehensive guide to neural architecture search. We give a taxonomy of search spaces, algorithms, and speedup techniques, and we discuss resources such as benchmarks, best practices, other surveys, and open-source libraries.},
  file        = {/Users/ryedida/Zotero/storage/9VAFVWTZ/White et al. - 2023 - Neural Architecture Search Insights from 1000 Pap.pdf}
}

@article{wignerDistributionRootsCertain1958,
  title = {On the Distribution of the Roots of Certain Symmetric Matrices},
  author = {Wigner, Eugene P.},
  date = {1958},
  journaltitle = {Annals of Mathematics},
  volume = {67},
  number = {2},
  pages = {325--327},
  publisher = {{JSTOR}},
  issn = {0003-486X},
  file = {/Users/ryedida/Zotero/storage/922GPNR8/Wigner_1958_On the distribution of the roots of certain symmetric matrices.pdf}
}

@article{Wigness2018,
  title = {Efficient {{Label Collection}} for {{Image Datasets}} via {{Hierarchical Clustering}}},
  author = {Wigness, Maggie and Draper, Bruce A. and Beveridge, J. Ross},
  date = {2018},
  journaltitle = {International Journal of Computer Vision},
  volume = {126},
  number = {1},
  pages = {59--85},
  publisher = {{Springer US}},
  issn = {1573-1405},
  doi = {10.1007/s11263-017-1039-1},
  keywords = {Efficient label collection,Hierarchical clustering,Image classification,Visual concept discovery},
  file = {/Users/ryedida/Zotero/storage/FVZVVWNH/Wigness, Draper, Beveridge - 2018 - Efficient Label Collection for Image Datasets via Hierarchical Clustering(2).pdf}
}

@book{williams2006gaussian,
  title = {Gaussian Processes for Machine Learning},
  author = {Rasmussen, Carl Edward and Williams, Christopher K. I.},
  date = {2006},
  publisher = {{MIT Press}},
  location = {{Cambridge, MA}}
}

@article{Winchell,
  title = {Can {{Textbook Annotations Serve}} as an {{Early Predictor}} of {{Student Learning}}?},
  author = {Winchell, Adam and Mozer, Michael and Lan, Andrew and Grimaldi, Phillip and Pashler, Harold},
  file = {/Users/ryedida/Zotero/storage/ZF2PBMZV/Winchell et al. - Unknown - Can Textbook Annotations Serve as an Early Predictor of Student Learning(2).pdf}
}

@inproceedings{wohlin2014guidelines,
  author    = {Wohlin, Claes},
  title     = {Guidelines for Snowballing in Systematic Literature Studies and a Replication in Software Engineering},
  booktitle = {Proceedings of the 18th International Conference on Evaluation and Assessment in Software Engineering},
  date      = {2014},
  pages     = {1--10}
}

@inproceedings{Wong2018,
  title = {Provable Defenses against Adversarial Examples via the Convex Outer Adversarial Polytope},
  booktitle = {Proceedings of the 35th {{International Conference}} on {{Machine Learning}}},
  author = {Wong, Eric and Kolter, J. Zico},
  date = {2018},
  volume = {12},
  eprint = {1711.00851},
  eprinttype = {arxiv},
  pages = {8405--8423},
  abstract = {We propose a method to learn deep ReLU-based classifiers that are provably robust against norm-bounded adversarial perturbations on the training data. For previously unseen examples, the approach is guaranteed to detect all adversarial examples, though it may flag some non-adversarial examples as well. The basic idea is to consider a convex outer approximation of the set of activations reachable through a norm-bounded perturbation, and we develop a robust optimization procedure that minimizes the worst case loss over this outer region (via a linear program). Crucially, we show that the dual problem to this linear program can be represented itself as a deep network similar to the backpropagation network, leading to very efficient optimization approaches that produce guaranteed bounds on the robust loss. The end result is that by executing a few more forward and backward passes through a slightly modified version of the original network (though possibly with much larger batch sizes), we can learn a classifier that is provably robust to any norm-bounded adversarial attack. We illustrate the approach on a number of tasks to train classifiers with robust adversarial guarantees (e.g. for MNIST, we produce a convolutional classifier that provably has less than 5.8\% test error for any adversarial attack with bounded ℓ∞ norm less than ϵ = 0.1), and code for all experiments is available at http://github.com/locuslab/convex-adversarial.},
  isbn = {9781510867963}
}

@inproceedings{Wu2018,
  title = {Group Normalization},
  booktitle = {Computer {{Vision}} -- {{ECCV}} 2018},
  author = {Wu, Yuxin and He, Kaiming},
  date = {2018},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {11217},
  eprint = {1803.08494v3},
  eprinttype = {arxiv},
  pages = {3--19},
  issn = {1611-3349},
  doi = {10.1007/978-3-030-01261-8_1},
  abstract = {Batch Normalization (BN) is a milestone technique in the development of deep learning, enabling various networks to train. However, normalizing along the batch dimension introduces problems --- BN's error increases rapidly when the batch size becomes smaller, caused by inaccurate batch statistics estimation. This limits BN's usage for training larger models and transferring features to computer vision tasks including detection, segmentation, and video, which require small batches constrained by memory consumption. In this paper, we present Group Normalization (GN) as a simple alternative to BN. GN divides the channels into groups and computes within each group the mean and variance for normalization. GN's computation is independent of batch sizes, and its accuracy is stable in a wide range of batch sizes. On ResNet-50 trained in ImageNet, GN has 10.6\% lower error than its BN counterpart when using a batch size of 2; when using typical batch sizes, GN is comparably good with BN and outperforms other normalization variants. Moreover, GN can be naturally transferred from pre-training to fine-tuning. GN can outperform its BN-based counterparts for object detection and segmentation in COCO, and for video classification in Kinetics, showing that GN can effectively replace the powerful BN in a variety of tasks. GN can be easily implemented by a few lines of code in modern libraries.},
  isbn = {9783030012601},
  file = {/Users/ryedida/Zotero/storage/MM6EAENT/Wu, He - 2018 - Group normalization(2).pdf}
}

@unpublished{Wu2018a,
  title = {{{WNGrad}}: {{Learn}} the {{Learning Rate}} in {{Gradient Descent}}},
  author = {Wu, Xiaoxia and Ward, Rachel and Bottou, Léon},
  date = {2018},
  eprint = {1803.02865},
  eprinttype = {arxiv},
  pages = {1--16},
  url = {http://arxiv.org/abs/1803.02865},
  abstract = {Adjusting the learning rate schedule in stochastic gradient methods is an important unresolved problem which requires tuning in practice. If certain parameters of the loss function such as smoothness or strong convexity constants are known, theoretical learning rate schedules can be applied. However, in practice, such parameters are not known, and the loss function of interest is not convex in any case. The recently proposed batch normalization reparametrization is widely adopted in most neural network architectures today because, among other advantages, it is robust to the choice of Lipschitz constant of the gradient in loss function, allowing one to set a large learning rate without worry. Inspired by batch normalization, we propose a general nonlinear update rule for the learning rate in batch and stochastic gradient descent so that the learning rate can be initialized at a high value, and is subsequently decreased according to gradient observations along the way. The proposed method is shown to achieve robustness to the relationship between the learning rate and the Lipschitz constant, and near-optimal convergence rates in both the batch and stochastic settings ($O(1/T)$ for smooth loss in the batch setting, and $O(1/\sqrt{T})$ for convex loss in the stochastic setting). We also show through numerical evidence that such robustness of the proposed method extends to highly nonconvex and possibly non-smooth loss function in deep learning problems. Our analysis establishes some first theoretical understanding into the observed robustness for batch normalization and weight normalization.},
  file = {/Users/ryedida/Zotero/storage/2XZCV22G/Wu, Ward, Bottou - 2018 - WNGrad Learn the Learning Rate in Gradient Descent(2).pdf}
}

@unpublished{wuAdversarialWeightPerturbation2020,
  author = {Wu, Dongxian and Xia, Shu-tao and Wang, Yisen},
  title = {Adversarial {{Weight Perturbation Helps Robust Generalization}}},
  date = {2020-10-13},
  eprinttype = {arxiv},
  eprint = {2004.05884},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/2004.05884},
  urldate = {2021-04-22},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation = {27 citations (Semantic Scholar/arXiv) [2021-04-22]},
  abstract = {The study on improving the robustness of deep neural networks against adversarial examples grows rapidly in recent years. Among them, adversarial training is the most promising one, based on which, a lot of improvements have been developed, such as adding regularizations or leveraging unlabeled data. However, these improvements seem to come from isolated perspectives, so that we are curious about if there is something in common behind them. In this paper, we investigate the surface geometry of several well-recognized adversarial training variants, and reveal that their adversarial loss landscape is closely related to the adversarially robust generalization, i.e., the flatter the adversarial loss landscape, the smaller the adversarially robust generalization gap. Based on this finding, we then propose a simple yet effective module, Adversarial Weight Perturbation (AWP), to directly regularize the flatness of the adversarial loss landscape in the adversarial training framework. Extensive experiments demonstrate that AWP indeed owns flatter landscape and can be easily incorporated into various adversarial training variants to enhance their adversarial robustness further.},
  file = {/Users/ryedida/Zotero/storage/9M6FWC96/Wu et al. - 2020 - Adversarial Weight Perturbation Helps Robust Gener.pdf}
}

@online{wuImplicitRegularizationDynamical2023,
  author = {Wu, Lei and Su, Weijie J.},
  title = {The {{Implicit Regularization}} of {{Dynamical Stability}} in {{Stochastic Gradient Descent}}},
  date = {2023-06-01},
  pubstate = {preprint},
  eprinttype = {arxiv},
  eprint = {2305.17490},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/2305.17490},
  urldate = {2023-12-12},
  langid = {english},
  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract = {In this paper, we study the implicit regularization of stochastic gradient descent (SGD) through the lens of dynamical stability (Wu et al., 2018). We start by revising existing stability analyses of SGD, showing how the Frobenius norm and trace of Hessian relate to different notions of stability. Notably, if a global minimum is linearly stable for SGD, then the trace of Hessian must be less than or equal to 2/η, where η denotes the learning rate. By contrast, for gradient descent (GD), the stability imposes a similar constraint but only on the largest eigenvalue of Hessian. We then turn to analyze the generalization properties of these stable minima, focusing specifically on two-layer ReLU networks and diagonal linear networks. Notably, we establish the equivalence between these metrics of sharpness and certain parameter norms for the two models, which allows us to show that the stable minima of SGD provably generalize well. By contrast, the stability-induced regularization of GD is provably too weak to ensure satisfactory generalization. This discrepancy provides an explanation of why SGD often generalizes better than GD. Note that the learning rate (LR) plays a pivotal role in the strength of stability-induced regularization. As the LR increases, the regularization effect becomes more pronounced, elucidating why SGD with a larger LR consistently demonstrates superior generalization capabilities. Additionally, numerical experiments are provided to support our theoretical findings.},
  file = {/Users/ryedida/Zotero/storage/ILYNU5IM/Wu and Su - 2023 - The Implicit Regularization of Dynamical Stability.pdf}
}

@unpublished{Xiang2017,
  title = {Reachable {{Set Computation}} and {{Safety Verification}} for {{Neural Networks}} with {{ReLU Activations}}},
  author = {Xiang, Weiming and Tran, Hoang-Dung and Johnson, Taylor T.},
  date = {2017},
  eprint = {1712.08163},
  eprinttype = {arxiv},
  pagetotal = {19},
  url = {http://arxiv.org/abs/1712.08163},
  abstract = {Neural networks have been widely used to solve complex real-world problems. Due to the complicate, nonlinear, non-convex nature of neural networks, formal safety guarantees for the output behaviors of neural networks will be crucial for their applications in safety-critical systems. In this paper, the output reachable set computation and safety verification problems for a class of neural networks consisting of Rectified Linear Unit (ReLU) activation functions are addressed. A layer-by-layer approach is developed to compute output reachable set. The computation is formulated in the form of a set of manipulations for a union of polyhedra, which can be efficiently applied with the aid of polyhedron computation tools. Based on the output reachable set computation results, the safety verification for a ReLU neural network can be performed by checking the intersections of unsafe regions and output reachable set described by a union of polyhedra. A numerical example of a randomly generated ReLU neural network is provided to show the effectiveness of the approach developed in this paper.},
  file = {/Users/ryedida/Zotero/storage/QVR88M2N/Xiang, Tran, Johnson - 2017 - Reachable Set Computation and Safety Verification for Neural Networks with ReLU Activations(2).pdf}
}

@article{Xiang2018,
  title = {Output {{Reachable Set Estimation}} and {{Verification}} for {{Multilayer Neural Networks}}},
  author = {Xiang, Weiming and Tran, Hoang-Dung and Johnson, Taylor T.},
  date = {2018},
  journaltitle = {IEEE Transactions on Neural Networks and Learning Systems},
  volume = {29},
  number = {11},
  eprint = {1708.03322},
  eprinttype = {arxiv},
  pages = {5777--5783},
  publisher = {{IEEE}},
  issn = {2162-2388},
  doi = {10.1109/TNNLS.2018.2808470},
  abstract = {In this brief, the output reachable estimation and safety verification problems for multilayer perceptron (MLP) neural networks are addressed. First, a conception called maximum sensitivity is introduced, and for a class of MLPs whose activation functions are monotonic functions, the maximum sensitivity can be computed via solving convex optimization problems. Then, using a simulation-based method, the output reachable set estimation problem for neural networks is formulated into a chain of optimization problems. Finally, an automated safety verification is developed based on the output reachable set estimation result. An application to the safety verification for a robotic arm model with two joints is presented to show the effectiveness of the proposed approaches.},
  keywords = {Multilayer perceptron (MLP),reachable set estimation,simulation,verification},
  file = {/Users/ryedida/Zotero/storage/SSCZ3IJX/Xiang, Tran, Johnson - 2018 - Output Reachable Set Estimation and Verification for Multilayer Neural Networks(2).pdf}
}

@inproceedings{xianZeroShotLearningGood2017,
  title = {Zero-{{Shot Learning}} — {{The Good}}, the {{Bad}} and the {{Ugly}}},
  booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  author = {Xian, Yongqin and Schiele, Bernt and Akata, Zeynep},
  date = {2017-07},
  pages = {3077--3086},
  publisher = {{IEEE}},
  venue = {Honolulu, HI},
  doi = {10.1109/CVPR.2017.328},
  url = {http://ieeexplore.ieee.org/document/8099811/},
  urldate = {2021-04-01},
  abstract = {Due to the importance of zero-shot learning, the number of proposed approaches has increased steadily recently. We argue that it is time to take a step back and to analyze the status quo of the area. The purpose of this paper is threefold. First, given the fact that there is no agreed upon zero-shot learning benchmark, we first define a new benchmark by unifying both the evaluation protocols and data splits. This is an important contribution as published results are often not comparable and sometimes even flawed due to, e.g. pre-training on zero-shot test classes. Second, we compare and analyze a significant number of the state-of-the-art methods in depth, both in the classic zero-shot setting but also in the more realistic generalized zero-shot setting. Finally, we discuss limitations of the current status of the area which can be taken as a basis for advancing it.},
  eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  isbn = {978-1-5386-0457-1},
  langid = {english},
  annotation = {379 citations (Semantic Scholar/DOI) [2021-04-01]},
  file = {/Users/ryedida/Zotero/storage/2QC6HG37/Xian et al. - 2017 - Zero-Shot Learning — The Good, the Bad and the Ugl.pdf}
}

@unpublished{xiaoEnhancingAdversarialDefense2019,
  title = {Enhancing {{Adversarial Defense}} by {{k-Winners-Take-All}}},
  author = {Xiao, Chang and Zhong, Peilin and Zheng, Changxi},
  date = {2019-10-28},
  eprint = {1905.10510},
  eprinttype = {arxiv},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1905.10510},
  urldate = {2021-04-09},
  abstract = {We propose a simple change to the current neural network structure for defending against gradient-based adversarial attacks. Instead of using popular activation functions (such as ReLU), we advocate the use of k-Winners-Take-All (k-WTA) activation, a C0 discontinuous function that purposely invalidates the neural network model’s gradient at densely distributed input data points. Our proposal is theoretically rationalized. We show why the discontinuities in k-WTA networks can largely prevent gradient-based search of adversarial examples and why they at the same time remain innocuous to the network training. This understanding is also empirically backed. Even without notoriously expensive adversarial training, the robustness performance of our networks is comparable to conventional ReLU networks optimized by adversarial training. Furthermore, after also optimized through adversarial training, our networks outperform the state-of-the-art methods under white-box attacks on various datasets that we experimented with.},
  langid = {english},
  keywords = {Computer Science - Artificial Intelligence,Computer Science - Cryptography and Security,Computer Science - Data Structures and Algorithms,Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation = {9 citations (Semantic Scholar/arXiv) [2021-04-09]},
  file = {/Users/ryedida/Zotero/storage/JZIX58LJ/Xiao et al. - 2019 - Enhancing Adversarial Defense by k-Winners-Take-Al.pdf}
}

@inproceedings{xiaROBUSTEARLYLEARNINGHINDERING2021,
  title = {Robust {{Early-Learning}}: {{Hindering}} the {{Memorization}} of {{Noisy Labels}}},
  author = {Xia, Xiaobo and Liu, Tongliang and Han, Bo and Gong, Chen and Wang, Nannan and Ge, Zongyuan and Chang, Yi},
  date = {2021},
  booktitle = {International {{Conference}} on {{Learning Representations}}},
  pagetotal = {15},
  abstract = {The memorization effects of deep networks show that they will first memorize training data with clean labels and then those with noisy labels. The early stopping method therefore can be exploited for learning with noisy labels. However, the side effect brought by noisy labels will influence the memorization of clean labels before early stopping. In this paper, motivated by the lottery ticket hypothesis which shows that only partial parameters are important for generalization, we find that only partial parameters are important for fitting clean labels and generalize well, which we term as critical parameters; while the other parameters tend to fit noisy labels and cannot generalize well, which we term as non-critical parameters. Based on this, we propose robust early-learning to reduce the side effect of noisy labels before early stopping and thus enhance the memorization of clean labels. Specifically, in each iteration, we divide all parameters into the critical and non-critical ones, and then perform different update rules for different types of parameters. Extensive experiments on benchmark-simulated and real-world label-noise datasets demonstrate the superiority of the proposed method over the state-of-the-art label-noise learning methods.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/4BREKSWI/Xia et al. - 2021 - ROBUST EARLY-LEARNING HINDERING THE MEMORIZATION .pdf}
}

@inproceedings{Xie2018,
  title = {Mitigating Adversarial Effects through Randomization},
  author = {Xie, Cihang and Zhang, Zhishuai and Yuille, Alan L. and Wang, Jianyu and Ren, Zhou},
  date = {2018},
  booktitle = {6th {{International Conference}} on {{Learning Representations}}, {{ICLR}} 2018 - {{Conference Track Proceedings}}},
  eprint = {1711.01991},
  eprinttype = {arxiv},
  pages = {1--16},
  abstract = {Convolutional neural networks have demonstrated high accuracy on various tasks in recent years. However, they are extremely vulnerable to adversarial examples. For example, imperceptible perturbations added to clean images can cause convolutional neural networks to fail. In this paper, we propose to utilize randomization at inference time to mitigate adversarial effects. Specifically, we use two randomization operations: random resizing, which resizes the input images to a random size, and random padding, which pads zeros around the input images in a random manner. Extensive experiments demonstrate that the proposed randomization method is very effective at defending against both single-step and iterative attacks. Our method provides the following advantages: 1) no additional training or fine-tuning, 2) very few additional computations, 3) compatible with other adversarial defense methods. By combining the proposed randomization method with an adversarially trained model, it achieves a normalized score of 0.924 (ranked No.2 among 107 defense teams) in the NIPS 2017 adversarial examples defense challenge, which is far better than using adversarial training alone with a normalized score of 0.773 (ranked No.56). The code is public available at https://github.com/cihangxie/NIPS2017\_adv\_challenge\_defense.},
  file = {/Users/ryedida/Zotero/storage/BW5LENGJ/Xie et al. - 2018 - Mitigating adversarial effects through randomization(2).pdf}
}

@unpublished{xie2018snas,
  author = {Xie, Sirui and Zheng, Hehui and Liu, Chunxiao and Lin, Liang},
  title = {{{SNAS}}: Stochastic Neural Architecture Search},
  date = {2018},
  eprinttype = {arxiv},
  eprint = {1812.09926}
}

@inproceedings{xu15fse,
  title = {Hey, You Have Given Me Too Many Knobs!: {{Understanding}} and Dealing with Over-Designed Configuration in System Software},
  booktitle = {Proceedings of the 2015 10th Joint Meeting on Foundations of Software Engineering},
  author = {Xu, Tianyin and Jin, Long and Fan, Xuepeng and Zhou, Yuanyuan and Pasupathy, Shankar and Talwadker, Rukma},
  date = {2015},
  series = {{{ESEC}}/{{FSE}} 2015},
  pages = {307--319},
  publisher = {{Association for Computing Machinery}},
  location = {{New York, NY, USA}},
  doi = {10.1145/2786805.2786852},
  abstract = {Configuration problems are not only prevalent, but also severely impair the reliability of today's system software. One fundamental reason is the ever-increasing complexity of configuration, reflected by the large number of configuration parameters ("knobs"). With hundreds of knobs, configuring system software to ensure high reliability and performance becomes a daunting, error-prone task. This paper makes a first step in understanding a fundamental question of configuration design: "do users really need so many knobs?" To provide the quantitatively answer, we study the configuration settings of real-world users, including thousands of customers of a commercial storage system (Storage-A), and hundreds of users of two widely-used open-source system software projects. Our study reveals a series of interesting findings to motivate software architects and developers to be more cautious and disciplined in configuration design. Motivated by these findings, we provide a few concrete, practical guidelines which can significantly reduce the configuration space. Take Storage-A as an example, the guidelines can remove 51.9\% of its parameters and simplify 19.7\% of the remaining ones with little impact on existing users. Also, we study the existing configuration navigation methods in the context of "too many knobs" to understand their effectiveness in dealing with the over-designed configuration, and to provide practices for building navigation support in system software.},
  isbn = {978-1-4503-3675-8},
  pagetotal = {13},
  keywords = {Complexity,Configuration,Difficulty,Error,Navigation,Parameter,Simplification}
}

@inproceedings{Xu2016,
  title = {Using {{Social Dynamics}} to {{Make Individual Predictions}}: {{Variational Inference}} with a {{Stochastic Kinetic Model}}},
  author = {Xu, Zhen and Dong, Wen and Srihari, Sargur},
  date = {2016},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  volume = {29},
  eprint = {1611.02181},
  eprinttype = {arxiv},
  issn = {1049-5258},
  url = {http://arxiv.org/abs/1611.02181},
  abstract = {Social dynamics is concerned primarily with interactions among individuals and the resulting group behaviors, modeling the temporal evolution of social systems via the interactions of individuals within these systems. In particular, the availability of large-scale data from social networks and sensor networks offers an unprecedented opportunity to predict state-changing events at the individual level. Examples of such events include disease transmission, opinion transition in elections, and rumor propagation. Unlike previous research focusing on the collective effects of social systems, this study makes efficient inferences at the individual level. In order to cope with dynamic interactions among a large number of individuals, we introduce the stochastic kinetic model to capture adaptive transition probabilities and propose an efficient variational inference algorithm the complexity of which grows linearly --- rather than exponentially --- with the number of individuals. To validate this method, we have performed epidemic-dynamics experiments on wireless sensor network data collected from more than ten thousand people over three years. The proposed algorithm was used to track disease transmission and predict the probability of infection for each individual. Our results demonstrate that this method is more efficient than sampling while nonetheless achieving high accuracy.},
  isbn = {9781369593006},
  file = {/Users/ryedida/Zotero/storage/UBU5X3QD/Xu, Dong, Srihari - 2016 - Using Social Dynamics to Make Individual Predictions Variational Inference with a Stochastic Kinetic Model(2).pdf}
}

@article{xuPost2VecLearningDistributed2021,
  title = {{{Post2Vec}}: {{Learning Distributed Representations}} of {{Stack Overflow Posts}}},
  shorttitle = {{{Post2Vec}}},
  author = {Xu, Bowen and Hoang, Thong and Sharma, Abhishek and Yang, Chengran and Xia, Xin and Lo, David},
  date = {2022-09},
  journaltitle = {IEEE Transactions on Software Engineering},
  volume = {48},
  number = {9},
  pages = {3423--3441},
  issn = {1939-3520},
  doi = {10.1109/TSE.2021.3093761},
  abstract = {Past studies have proposed solutions that analyze Stack Overflow content to help users find desired information or aid various downstream software engineering tasks. A common step performed by those solutions is to extract suitable representations of posts; typically, in the form of meaningful vectors. These vectors are then used for different tasks, for example, tag recommendation, relatedness prediction, post classification, and API recommendation. Intuitively, the quality of the vector representations of posts determines the effectiveness of the solutions in performing the respective tasks. In this work, to aid existing studies that analyze Stack Overflow posts, we propose a specialized deep learning architecture Post2Vec which extracts distributed representations of Stack Overflow posts. Post2Vec is aware of different types of content present in Stack Overflow posts, i.e., title, description, and code snippets, and integrates them seamlessly to learn post representations. Tags provided by Stack Overflow users that serve as a common vocabulary that captures the semantics of posts are used to guide Post2Vec in its task. To evaluate the quality of Post2Vec’s deep learning architecture, we first investigate its end-to-end effectiveness in tag recommendation task. The results are compared to those of state-of-the-art tag recommendation approaches that also employ deep neural networks. We observe that Post2Vec achieves 15-25\% improvement in terms of F1-score@5 at a lower computational cost. Moreover, to evaluate the value of representations learned by Post2Vec, we use them for three other tasks, i.e., relatedness prediction, post classification, and API recommendation. We demonstrate that the representations can be used to boost the effectiveness of state-of-the-art solutions for the three tasks by substantial margins (by 10\%, 7\%, and 10\% in terms of F1-score, F1-score, and correctness, respectively). We release our replication package at https://github.com/maxxbw/Post2Vec.},
  keywords = {Computational modeling,Computer architecture,Deep learning,Encoding,Feature extraction,Semantics,Task analysis},
  annotation = {1 citations (Semantic Scholar/DOI) [2021-12-04]},
  file = {/Users/ryedida/Zotero/storage/624WU6F8/Xu et al. - 2021 - Post2Vec Learning Distributed Representations of .pdf;/Users/ryedida/Zotero/storage/4KCA97MJ/9469219.html}
}

@article{yanCommutabilityEssentialInfimum1985,
  author = {Yan, Jia-An},
  title = {On the Commutability of Essential Infimum and Conditional Expectation Operators},
  journaltitle = {Chinese Science Bulletin},
  date = {1985},
  volume = {30},
  number = {8},
  pages = {1013--1018},
  file = {/Users/ryedida/Zotero/storage/U5DFMSRK/Yan_1985_On the commutability of essential infimum and conditional expectation operators.pdf}
}

@unpublished{Yang2018,
  title = {Deep {{Neural Decision Trees}}},
  author = {Yang, Yongxin and Morillo, Irene Garcia and Hospedales, Timothy M.},
  date = {2018},
  eprint = {1806.06988},
  eprinttype = {arxiv},
  url = {http://arxiv.org/abs/1806.06988},
  abstract = {Deep neural networks have been proven powerful at processing perceptual data, such as images and audio. However for tabular data, tree-based models are more popular. A nice property of tree-based models is their natural interpretability. In this work, we present Deep Neural Decision Trees (DNDT) -- tree models realised by neural networks. A DNDT is intrinsically interpretable, as it is a tree. Yet as it is also a neural network (NN), it can be easily implemented in NN toolkits, and trained with gradient descent rather than greedy splitting. We evaluate DNDT on several tabular datasets, verify its efficacy, and investigate similarities and differences between DNDT and vanilla decision trees. Interestingly, DNDT self-prunes at both split and feature-level.},
  note = {Presented at the 2018 ICML Workshop on Human Interpretability in Machine Learning (WHI 2018)},
  file = {/Users/ryedida/Zotero/storage/8HM27DDA/Yang, Morillo, Hospedales - 2018 - Deep Neural Decision Trees(2).pdf}
}

@article{yang2021learning,
  author = {Yang, Xueqi and Chen, Jianfeng and Yedida, Rahul and Yu, Zhe and Menzies, Tim},
  title = {Learning to Recognize Actionable Static Code Warnings (Is Intrinsically Easy)},
  journaltitle = {Empirical Software Engineering},
  publisher = {{Springer}},
  date = {2021},
  volume = {26},
  pages = {1--24}
}

@article{yang2021understanding,
  author = {Yang, Xueqi and Yu, Zhe and Wang, Junjie and Menzies, Tim},
  title = {Understanding Static Code Warnings: {{An}} Incremental {{AI}} Approach},
  journaltitle = {Expert Systems with Applications},
  publisher = {{Elsevier}},
  date = {2021},
  volume = {167},
  pages = {114134}
}

@inproceedings{yangCloserLookAccuracya,
  title = {A {{Closer Look}} at {{Accuracy}} vs. {{Robustness}}},
  author = {Yang, Yao-Yuan and Rashtchian, Cyrus and Zhang, Hongyang and Salakhutdinov, Ruslan and Chaudhuri, Kamalika},
  date = {2020},
  booktitle = {Advances in {{Neural Information Processing Systems}}},
  volume = {33},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/GYUNXPWX/Yang et al. - A Closer Look at Accuracy vs. Robustness.pdf}
}

@unpublished{yangMENetEffectiveAdversarial2019,
  author = {Yang, Yuzhe and Zhang, Guo and Katabi, Dina and Xu, Zhi},
  title = {{{ME-Net}}: {{Towards Effective Adversarial Robustness}} with {{Matrix Estimation}}},
  shorttitle = {{{ME-Net}}},
  date = {2019-05-28},
  eprinttype = {arxiv},
  eprint = {1905.11971},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/1905.11971},
  urldate = {2021-03-28},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation = {52 citations (Semantic Scholar/arXiv) [2021-03-28]},
  abstract = {Deep neural networks are vulnerable to adversarial attacks. The literature is rich with algorithms that can easily craft successful adversarial examples. In contrast, the performance of defense techniques still lags behind. This paper proposes ME-Net, a defense method that leverages matrix estimation (ME). In ME-Net, images are preprocessed using two steps: first pixels are randomly dropped from the image; then, the image is reconstructed using ME. We show that this process destroys the adversarial structure of the noise, while re-enforcing the global structure in the original image. Since humans typically rely on such global structures in classifying images, the process makes the network mode compatible with human perception. We conduct comprehensive experiments on prevailing benchmarks such as MNIST, CIFAR-10, SVHN, and Tiny-ImageNet. Comparing ME-Net with state-of-the-art defense mechanisms shows that ME-Net consistently outperforms prior techniques, improving robustness against both black-box and white-box attacks.},
  file = {/Users/ryedida/Zotero/storage/FM8HE6XJ/Yang et al. - 2019 - ME-Net Towards Effective Adversarial Robustness w.pdf}
}

@inproceedings{yangSemisupervisedLogbasedAnomaly2021,
  title = {Semi-Supervised {{Log-based Anomaly Detection}} via {{Probabilistic Label Estimation}}},
  author = {Yang, Lin and Chen, Junjie and Wang, Zan and Wang, Weijing and Jiang, Jiajun and Dong, Xuyuan and Zhang, Wenbin},
  date = {2021},
  booktitle = {International {{Conference}} on {{Software Engineering}} ({{ICSE}})},
  url = {https://drive.google.com/file/d/1H4p-fv1KY81HfbCDsrf3tX8ZP2p7xqp8/view},
  abstract = {With the growth of software systems, logs have become an important data to aid system maintenance. Log-based anomaly detection is one of the most important methods for such purpose, which aims to automatically detect system anomalies via log analysis. However, existing log-based anomaly detection approaches still suffer from practical issues due to either depending on a large amount of manually labeled training data (supervised approaches) or unsatisfactory performance without learning the knowledge on historical anomalies (unsupervised and semi-supervised approaches). In this paper, we propose a novel practical log-based anomaly detection approach, PLELog, which is semi-supervised to get rid of time-consuming manual labeling and incorporates the knowledge on historical anomalies via probabilistic label estimation to bring supervised approaches’ superiority into play. In addition, PLELog is able to stay immune to unstable log data via semantic embedding and detect anomalies efficiently and effectively by designing an attention-based GRU neural network. We evaluated PLELog on two most widely-used public datasets, and the results demonstrate the effectiveness of PLELog, significantly outperforming the compared approaches with an average of 181.6\% improvement in terms of F1-score. In particular, PLELog has been applied to two real-world systems from our university and a large corporation, further demonstrating its practicability.},
  file = {/Users/ryedida/Downloads/ICSE21-PLELog.pdf}
}

@unpublished{yaoAutomatedDiscoveryAdaptive2021,
  author = {Yao, Chengyuan and Bielik, Pavol and Tsankov, Petar and Vechev, Martin},
  title = {Automated {{Discovery}} of {{Adaptive Attacks}} on {{Adversarial Defenses}}},
  date = {2021-02-27},
  eprinttype = {arxiv},
  eprint = {2102.11860},
  eprintclass = {cs, stat},
  url = {http://arxiv.org/abs/2102.11860},
  urldate = {2021-03-24},
  langid = {english},
  keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract = {Reliable evaluation of adversarial defenses is a challenging task, currently limited to an expert who manually crafts attacks that exploit the defense’s inner workings, or to approaches based on ensemble of fixed attacks, none of which may be effective for the specific defense at hand. Our key observation is that custom attacks are composed from a set of reusable building blocks, such as fine-tuning relevant attack parameters, network transformations, and custom loss functions. Based on this observation, we present an extensible framework that defines a search space over these reusable building blocks and automatically discovers an effective attack on a given model with an unknown defense by searching over suitable combinations of these blocks. We evaluated our framework on 23 adversarial defenses and showed it outperforms AutoAttack (Croce \& Hein, 2020b), the current state-of-the-art tool for reliable evaluation of adversarial defenses: our discovered attacks are either stronger, producing 3.0\%-50.8\% additional adversarial examples (10 cases), or are typically 2x faster while enjoying similar adversarial robustness (13 cases).},
  file = {/Users/ryedida/Zotero/storage/CIXUSENL/Yao et al. - 2021 - Automated Discovery of Adaptive Attacks on Adversa.pdf}
}

@inproceedings{yaoREACSYNERGIZINGREASONING2023,
  title = {{{ReAct}}: {{Synergizing Reasoning}} and {{Acting}} in {{Language Models}}},
  author = {Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and Shafran, Izhak and Narasimhan, Karthik and Cao, Yuan},
  date = {2023},
  booktitle = {International {{Conference}} on {{Learning Representations}}},
  abstract = {While large language models (LLMs) have demonstrated impressive performance across tasks in language understanding and interactive decision making, their abilities for reasoning (e.g. chain-of-thought prompting) and acting (e.g. action plan generation) have primarily been studied as separate topics. In this paper, we explore the use of LLMs to generate both reasoning traces and task-specific actions in an interleaved manner, allowing for greater synergy between the two: reasoning traces help the model induce, track, and update action plans as well as handle exceptions, while actions allow it to interface with and gather additional information from external sources such as knowledge bases or environments. We apply our approach, named ReAct, to a diverse set of language and decision making tasks and demonstrate its effectiveness over state-of-the-art baselines in addition to improved human interpretability and trustworthiness. Concretely, on question answering (HotpotQA) and fact verification (Fever), ReAct overcomes prevalent issues of hallucination and error propagation in chain-of-thought reasoning by interacting with a simple Wikipedia API, and generating human-like task-solving trajectories that are more interpretable than baselines without reasoning traces. Furthermore, on two interactive decision making benchmarks (ALFWorld and WebShop), ReAct outperforms imitation and reinforcement learning methods by an absolute success rate of 34\% and 10\% respectively, while being prompted with only one or two in-context examples.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/W3ELFDRB/Yao et al. - 2023 - REAC T SYNERGIZING REASONING AND ACTING IN LANGUA.pdf}
}

@report{Yedida,
  title = {Employee {{Attrition Prediction}}},
  author = {Yedida, Rahul and Reddy, Rahul and Vahi, Rakshit and Kulkarni, Deepti},
  abstract = {This project aims to predict whether an employee of a company will leave or not, using the k-Nearest Neighbors algorithm. We use evaluation of employee performance, average monthly hours at work and number of years spent in the company, among others, as our features. Other approaches to this problem include the use of ANNs, decision trees and logistic regression. The dataset was split, using 70\% for training the algorithm and 30\% for testing it, achieving an accuracy of 94.32\%.},
  keywords = {employee attrition,k-Nearest Neighbors,Predictive analysis,scikit-learn},
  file = {/Users/ryedida/Zotero/storage/7FEM5PN4/Yedida et al. - Unknown - Employee Attrition Prediction(2).pdf}
}

@article{yedida2021lipschitzlr,
  title = {{{LipschitzLR}}: {{Using}} Theoretically Computed Adaptive Learning Rates for Fast Convergence},
  author = {Yedida, Rahul and Saha, Snehanshu and Prashanth, Tejas},
  date = {2021},
  journaltitle = {Applied Intelligence},
  volume = {51},
  pages = {1460--1478},
  publisher = {{Springer}}
}

@article{yedidaExpertSystemRedesigning2023,
  title = {An Expert System for Redesigning Software for Cloud Applications},
  author = {Yedida, Rahul and Krishna, Rahul and Kalia, Anup and Menzies, Tim and Xiao, Jin and Vukovic, Maja},
  date = {2023-06-01},
  journaltitle = {Expert Systems with Applications},
  shortjournal = {Expert Systems with Applications},
  volume = {219},
  pages = {119673},
  issn = {0957-4174},
  doi = {10.1016/j.eswa.2023.119673},
  url = {https://www.sciencedirect.com/science/article/pii/S0957417423001744},
  urldate = {2023-10-10},
  abstract = {Cloud-based software has many advantages. When services are divided into many independent components, they are easier to update. Also, during peak demand, it is easier to scale cloud services (just hire more CPUs). Hence, many organizations are partitioning their monolithic enterprise applications into cloud-based microservices. Recently there has been much work using machine learning to simplify this partitioning task. Despite much research, no single partitioning method can be recommended as generally useful. More specifically, those prior solutions are “brittle”; i.e. if they work well for one kind of goal in one dataset, then they can be sub-optimal if applied to many datasets and multiple goals. This work extends prior work and proposes DEEPLY to fix the brittleness problem. Specifically, we use (a) hyper-parameter optimization to sample from the Pareto frontier of configurations (b) a weighted loss to choose optimally from this Pareto frontier (c) the 1cycle learning rate policy to avoid local minima with Adam and (d) spectral clustering over k-means. Our work shows that DEEPLY outperforms other algorithms in this space across different metrics. Moreover, our ablation study reveals that of the changes, the weighted loss is the most important, followed by hyper-parameter optimization (contrary to prior belief). To enable the reuse of this research, DEEPLY is available on-line.},
  keywords = {Deep learning,Hyper-parameter optimization,Microservices,Refactoring,Software engineering},
  file = {/Users/ryedida/Zotero/storage/YA4VCINK/Yedida et al_2023_An expert system for redesigning software for cloud applications.pdf;/Users/ryedida/Zotero/storage/5B2DDA9J/S0957417423001744.html}
}

@article{yedidaHowFindActionable2023,
  title = {How to {{Find Actionable Static Analysis Warnings}}: {{A Case Study With FindBugs}}},
  shorttitle = {How to {{Find Actionable Static Analysis Warnings}}},
  author = {Yedida, Rahul and Kang, Hong Jin and Tu, Huy and Yang, Xueqi and Lo, David and Menzies, Tim},
  date = {2023-04},
  journaltitle = {IEEE Transactions on Software Engineering},
  volume = {49},
  number = {4},
  pages = {2856--2872},
  issn = {1939-3520},
  doi = {10.1109/TSE.2023.3234206},
  abstract = {Automatically generated static code warnings suffer from a large number of false alarms. Hence, developers only take action on a small percent of those warnings. To better predict which static code warnings should not be ignored, we suggest that analysts need to look deeper into their algorithms to find choices that better improve the particulars of their specific problem. Specifically, we show here that effective predictors of such warnings can be created by methods that locally adjust the decision boundary (between actionable warnings and others). These methods yield a new high water-mark for recognizing actionable static code warnings. For eight open-source Java projects (cassandra, jmeter, commons, lucene-solr, maven, ant, tomcat, derby) we achieve perfect test results on 4/8 datasets and, overall, a median AUC (area under the true negatives, true positives curve) of 92\%.},
  keywords = {Codes,Computer bugs,false alarms,hyperparameter optimization,Industries,locality,Measurement,Software analytics,Source coding,static analysis,Training},
  annotation = {0 citations (Semantic Scholar/DOI) [2023-04-21]},
  file = {/Users/ryedida/Zotero/storage/4DK9X78Q/Yedida et al. - 2023 - How to Find Actionable Static Analysis Warnings A.pdf;/Users/ryedida/Zotero/storage/8Y4CLQZU/Yedida et al. - 2023 - How to Find Actionable Static Analysis Warnings A.pdf;/Users/ryedida/Zotero/storage/BSK6QV3Y/10012496.html}
}

@inproceedings{yedidaHowImproveDeep2022,
  title = {How to Improve Deep Learning for Software Analytics: (A Case Study with Code Smell Detection)},
  shorttitle = {How to Improve Deep Learning for Software Analytics},
  booktitle = {Proceedings of the 19th {{International Conference}} on {{Mining Software Repositories}}},
  author = {Yedida, Rahul and Menzies, Tim},
  date = {2022-05-23},
  pages = {156--166},
  publisher = {{ACM}},
  location = {Pittsburgh, Pennsylvania},
  doi = {10.1145/3524842.3528458},
  url = {https://dl.acm.org/doi/10.1145/3524842.3528458},
  urldate = {2023-10-10},
  abstract = {To reduce technical debt and make code more maintainable, it is important to be able to warn programmers about code smells. State-of-the-art code smell detectors use deep learners, usually without exploring alternatives. For example, one promising alternative is GHOST (from TSE’21) that relies on a combination of hyper-parameter optimization of feedforward neural networks and a novel oversampling technique.},
  eventtitle = {{{MSR}} '22: 19th {{International Conference}} on {{Mining Software Repositories}}},
  isbn = {978-1-4503-9303-4},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/BAZ439T9/Yedida and Menzies - 2022 - How to improve deep learning for software analytic.pdf}
}

@article{yedidaSMOOTHIETheoryHyperparameter,
  title = {{{SMOOTHIE}}: {{A Theory}} of {{Hyper-parameter Optimization}} for {{Software Analytics}}},
  author = {Yedida, Rahul and Menzies, Tim},
  journaltitle = {IEEE Transactions on Software Engineering},
  abstract = {Hyper-parameter optimization is the black art of tuning a learner’s control parameters. In software analytics, a repeated result is that such tuning can result in dramatic performance improvements. Despite this, hyper-parameter optimization is often applied rarely or poorly in software analytics– perhaps due to the CPU cost of exploring all those parameter options can be prohibitive. We theorize that learners generalize better when the loss landscape is “smooth”. This theory is useful since the influence on “smoothness” of different hyper-parameter choices can be tested very quickly (e.g. for a deep learner, after just one epoch). To test this theory, this paper implements and tests SMOOTHIE, a novel hyper-parameter optimizer that guides its optimizations via considerations of “smoothness”. The experiments of this paper test SMOOTHIE on numerous SE tasks including (a) GitHub issue lifetime prediction; (b) detecting false alarms in static code warnings; (c) defect prediction, and (d) a set of standard ML datasets. In all these experiments, SMOOTHIE out-performed state-of-the-art optimizers. Better yet, SMOOTHIE ran 300\% faster than the prior state-of-the art. We hence conclude that this theory (that hyper-parameter optimization is best viewed as a “smoothing” function for the decision landscape), is both theoretically interesting and practically very useful.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/CDPNUCB3/Yedida and Menzies - SMOOTHIE A Theory of Hyper-parameter Optimization.pdf}
}

@unpublished{yedidaValueOversamplingDeep2021,
  author      = {Yedida, Rahul and Menzies, Tim},
  title       = {On the {{Value}} of {{Oversampling}} for {{Deep Learning}} in {{Software Defect Prediction}}},
  date        = {2021-03-18},
  eprinttype  = {arxiv},
  eprint      = {2008.03835},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2008.03835},
  urldate     = {2021-04-13},
  langid      = {english},
  keywords    = {Computer Science - Software Engineering},
  annotation  = {0 citations (Semantic Scholar/arXiv) [2021-04-13]},
  abstract    = {One truism of deep learning is that the automatic feature engineering (seen in the first layers of those networks) excuses data scientists from performing tedious manual feature engineering prior to running DL. For the specific case of deep learning for defect prediction, we show that that truism is false. Specifically, when we pre-process data with a novel oversampling technique called fuzzy sampling, as part of a larger pipeline called GHOST (Goal-oriented Hyper-parameter Optimization for Scalable Training), then we can do significantly better than the prior DL state of the art in 14/20 defect data sets. Our approach yields state-of-the-art results significantly faster deep learners. These results present a cogent case for the use of oversampling prior to applying deep learning on software defect prediction datasets.},
  file        = {/Users/ryedida/Zotero/storage/7Z8BBC53/Yedida and Menzies - 2021 - On the Value of Oversampling for Deep Learning in .pdf},
}

@article{yeoNewFamilyPower2000,
  title = {A New Family of Power Transformations to Improve Normality or Symmetry},
  author = {Yeo, In-Kwon and Johnson, Richard A.},
  date = {2000-12-01},
  journaltitle = {Biometrika},
  shortjournal = {Biometrika},
  volume = {87},
  number = {4},
  pages = {954--959},
  issn = {0006-3444, 1464-3510},
  doi = {10.1093/biomet/87.4.954},
  url = {https://academic.oup.com/biomet/article-lookup/doi/10.1093/biomet/87.4.954},
  urldate = {2024-01-27},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/FAQAHX2G/Yeo - 2000 - A new family of power transformations to improve n.pdf}
}

@unpublished{yinAdversarialExampleDetection2020,
  author      = {Yin, Xuwang and Kolouri, Soheil and Rohde, Gustavo K.},
  title       = {Adversarial {{Example Detection}} and {{Classification With Asymmetrical Adversarial Training}}},
  date        = {2020-02-22},
  eprinttype  = {arxiv},
  eprint      = {1905.11475},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1905.11475},
  urldate     = {2021-03-28},
  langid      = {english},
  keywords    = {Computer Science - Cryptography and Security,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract    = {The vulnerabilities of deep neural networks against adversarial examples have become a significant concern for deploying these models in sensitive domains. Devising a definitive defense against such attacks is proven to be challenging, and the methods relying on detecting adversarial samples are only valid when the attacker is oblivious to the detection mechanism. In this paper we first present an adversarial example detection method that provides performance guarantee to norm constrained adversaries. The method is based on the idea of training adversarial robust subspace detectors using asymmetrical adversarial training (AAT). The novel AAT objective presents a minimax problem similar to that of GANs; it has the same convergence property, and consequently supports the learning of class conditional distributions. We first demonstrate that the minimax problem could be reasonably solved by PGD attack, and then use the learned class conditional generative models to define generative detection/classification models that are both robust and more interpretable. We provide comprehensive evaluations of the above methods, and demonstrate their competitive performances and compelling properties on adversarial detection and robust classification problems.},
  file        = {/Users/ryedida/Zotero/storage/WB6RFSYQ/Yin et al. - 2020 - Adversarial Example Detection and Classification W.pdf},
}

@online{yingHierarchicalGraphRepresentation,
  title = {Hierarchical {{Graph Representation Learning}} with {{Differentiable Pooling}}},
  author = {Ying, Rex and You, Jiaxuan and Morris, Christopher and Ren, Xiang and Hamilton, William L. and Leskovec, Jure},
  date = {2018},
  eprint = {1806.08804},
  eprinttype = {arxiv},
  abstract = {Recently, graph neural networks (GNNs) have revolutionized the field of graph representation learning through effectively learned node embeddings, and achieved state-of-the-art results in tasks such as node classification and link prediction. However, current GNN methods are inherently flat and do not learn hierarchical representations of graphs—a limitation that is especially problematic for the task of graph classification, where the goal is to predict the label associated with an entire graph. Here we propose DIFFPOOL, a differentiable graph pooling module that can generate hierarchical representations of graphs and can be combined with various graph neural network architectures in an end-to-end fashion. DIFFPOOL learns a differentiable soft cluster assignment for nodes at each layer of a deep GNN, mapping nodes to a set of clusters, which then form the coarsened input for the next GNN layer. Our experimental results show that combining existing GNN methods with DIFFPOOL yields an average improvement of 5-10\% accuracy on graph classification benchmarks, compared to all existing pooling approaches, achieving a new state-of-the-art on four out of five benchmark data sets.}
}

@online{yoshidaSpectralNormRegularization,
  title = {Spectral {{Norm Regularization}} for {{Improving}} the {{Generalizability}} of {{Deep Learning}}},
  author = {Yoshida, Yuichi and Miyato, Takeru},
  date = {2017-05},
  eprint = {1705.10941v1},
  eprinttype = {arxiv},
  abstract = {We investigate the generalizability of deep learning based on the sensitivity to input perturbation. We hypothesize that the high sensitivity to the perturbation of data degrades the performance on it. To reduce the sensitivity to perturbation, we propose a simple and effective regularization method, referred to as spectral norm regularization, which penalizes the high spectral norm of weight matrices in neural networks. We provide supportive evidence for the abovementioned hypothesis by experimentally confirming that the models trained using spectral norm regularization exhibit better generalizability than other baseline methods.}
}

@inproceedings{Young2015,
  title = {Optimizing Deep Learning Hyper-Parameters through an Evolutionary Algorithm},
  author = {Young, Steven R. and Rose, Derek C. and Karnowski, Thomas P. and Lim, Seung Hwan and Patton, Robert M.},
  date = {2015},
  booktitle = {Proceedings of MLHPC 2015: Machine Learning in High-Performance Computing Environments - Held in conjunction with SC 2015: The International Conference for High Performance Computing, Networking, Storage and Analysis},
  doi = {10.1145/2834892.2834896},
  abstract = {There has been a recent surge of success in utilizing Deep Learning (DL) in imaging and speech applications for its relatively automatic feature generation and, in particular for convolutional neural networks (CNNs), high accuracy classification abilities. While these models learn their parameters through data-driven methods, model selection (as architecture construction) through hyper-parameter choices remains a tedious and highly intuition driven task. To address this, Multi-node Evolutionary Neural Networks for Deep Learning (MENNDL) is proposed as a method for automating network selection on computational clusters through hyper-parameter optimization performed via genetic algorithms.},
  isbn = {9781450340069},
  keywords = {Convolutional neural networks,Deep learning,Evolutionary algorithm,Hyper-parameter optimization},
  file = {/Users/ryedida/Zotero/storage/6VTYDY34/Young et al. - 2015 - Optimizing deep learning hyper-parameters through an evolutionary algorithm(2).pdf}
}

@unpublished{yu2020hyper,
  author     = {Yu, Tong and Zhu, Hong},
  title      = {Hyper-Parameter Optimization: {{A}} Review of Algorithms and Applications},
  date       = {2020},
  eprinttype = {arxiv},
  eprint     = {2003.05689},
}

@online{yuanAcceleratedTrainingIncrementally2023,
  ids = {yuanAcceleratedTrainingIncrementally2023a},
  title = {Accelerated {{Training}} via {{Incrementally Growing Neural Networks}} Using {{Variance Transfer}} and {{Learning Rate Adaptation}}},
  author = {Yuan, Xin and Savarese, Pedro and Maire, Michael},
  date = {2023-06-22},
  eprint = {2306.12700},
  eprinttype = {arxiv},
  eprintclass = {cs},
  url = {http://arxiv.org/abs/2306.12700},
  urldate = {2024-01-22},
  abstract = {We develop an approach to efficiently grow neural networks, within which parameterization and optimization strategies are designed by considering their effects on the training dynamics. Unlike existing growing methods, which follow simple replication heuristics or utilize auxiliary gradient-based local optimization, we craft a parameterization scheme which dynamically stabilizes weight, activation, and gradient scaling as the architecture evolves, and maintains the inference functionality of the network. To address the optimization difficulty resulting from imbalanced training effort distributed to subnetworks fading in at different growth phases, we propose a learning rate adaption mechanism that rebalances the gradient contribution of these separate subcomponents. Experimental results show that our method achieves comparable or better accuracy than training large fixed-size models, while saving a substantial portion of the original computation budget for training. We demonstrate that these gains translate into real wall-clock training speedups.},
  langid = {english},
  pubstate = {preprint},
  keywords = {Computer Science - Machine Learning},
  file = {/Users/ryedida/Zotero/storage/ISQ8U683/Yuan et al. - 2023 - Accelerated Training via Incrementally Growing Neu.pdf;/Users/ryedida/Zotero/storage/SNES6DHB/Yuan et al. - 2023 - Accelerated Training via Incrementally Growing Neu.pdf}
}

@article{yuanRevisitingKnowledgeDistillation,
  title = {Revisiting {{Knowledge Distillation}} via {{Label Smoothing Regularization}}},
  author = {Yuan, Li and Tay, Francis E. H. and Li, Guilin and Wang, Tao and Feng, Jiashi},
  pagetotal = {9},
  abstract = {Knowledge Distillation (KD) aims to distill the knowledge of a cumbersome teacher model into a lightweight student model. Its success is generally attributed to the privileged information on similarities among categories provided by the teacher model, and in this sense, only strong teacher models are deployed to teach weaker students in practice. In this work, we challenge this common belief by following experimental observations: 1) beyond the acknowledgment that the teacher can improve the student, the student can also enhance the teacher significantly by reversing the KD procedure; 2) a poorly-trained teacher with much lower accuracy than the student can still improve the latter significantly. To explain these observations, we provide a theoretical analysis of the relationships between KD and label smoothing regularization. We prove that 1) KD is a type of learned label smoothing regularization and 2) label smoothing regularization provides a virtual teacher model for KD. From these results, we argue that the success of KD is not fully due to the similarity information between categories from teachers, but also to the regularization of soft targets, which is equally or even more important.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/WPHIYAWR/Yuan et al. - Revisiting Knowledge Distillation via Label Smooth.pdf}
}

@report{yuanSTRUCTPOOLSTRUCTUREDGRAPH,
  title = {{{StructPool}}: {{Structured Graph Pooling}} via {{Conditional Random Fields}}},
  author = {Yuan, Hao and Ji, Shuiwang},
  abstract = {Learning high-level representations for graphs is of great importance for graph analysis tasks. In addition to graph convolution, graph pooling is an important but less explored research area. In particular, most of existing graph pooling techniques do not consider the graph structural information explicitly. We argue that such information is important and develop a novel graph pooling technique, known as the STRUCTPOOL, in this work. We consider the graph pooling as a node clustering problem, which requires the learning of a cluster assignment matrix. We propose to formulate it as a structured prediction problem and employ conditional random fields to capture the relationships among the assignments of different nodes. We also generalize our method to incorporate graph topological information in designing the Gibbs energy function. Experimental results on multiple datasets demonstrate the effectiveness of our proposed STRUCTPOOL.}
}

@inproceedings{Yudelson2013,
  title = {Individualized {{Bayesian}} Knowledge Tracing Models},
  booktitle = {Artificial {{Intelligence}} in {{Education}}},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  author = {Yudelson, Michael V. and Koedinger, Kenneth R. and Gordon, Geoffrey J.},
  date = {2013},
  issn = {0302-9743},
  doi = {10.1007/978-3-642-39112-5_18},
  abstract = {Bayesian Knowledge Tracing (BKT)[1] is a user modeling method extensively used in the area of Intelligent Tutoring Systems. In the standard BKT implementation, there are only skill-specific parameters. However, a large body of research strongly suggests that student-specific variability in the data, when accounted for, could enhance model accuracy [5,6,8]. In this work, we revisit the problem of introducing student-specific parameters into BKT on a larger scale. We show that student-specific parameters lead to a tangible improvement when predicting the data of unseen students, and that parameterizing students' speed of learning is more beneficial than parameterizing a priori knowledge.},
  isbn = {978-3-642-39111-8},
  keywords = {Bayesian knowledge tracing,Model fitting,Model selection,Student-specific model parameters},
  file = {/Users/ryedida/Zotero/storage/JGTCIGNS/Yudelson, Koedinger, Gordon - 2013 - Individualized bayesian knowledge tracing models(2).pdf}
}

@article{yuMultilayerPerceptronTrainability,
  author   = {Yu, Yueyao and Zhang, Yin},
  title    = {Multi-Layer {{Perceptron Trainability Explained}} via {{Variability}}},
  langid   = {english},
  abstract = {Despite the tremendous successes of deep neural networks (DNNs) in various applications, many fundamental aspects of deep learning remain incompletely understood, including DNN trainability. In a trainability study, one aims to discern what makes one DNN model easier to train than another under comparable conditions. In particular, our study focuses on multilayer perceptron (MLP) models equipped with the same number of parameters. We introduce a new notion called variability to help explain the benefits of deep learning and the difficulties in training very deep MLPs. Simply put, variability of a neural network represents the richness of landscape patterns in the data space with respect to well-scaled random weights. We empirically show that variability is positively correlated to the number of activations and negatively correlated to a phenomenon called “Collapse to Constant”, which is related but not identical to the well-known vanishing gradient phenomenon. Experiments on a small stylized model problem confirm that variability can indeed accurately predict MLP trainability. In addition, we demonstrate that, as an activation function in MLP models, the absolute value function can offer better variability than the popular ReLU function can.},
  file     = {/Users/ryedida/Zotero/storage/N3KTJ7PB/Yu and Zhang - Multi-layer Perceptron Trainability Explained via .pdf},
}

@unpublished{yuNewDefenseAdversarial2019,
  author      = {Yu, Tao and Hu, Shengyuan and Guo, Chuan and Chao, Wei-Lun and Weinberger, Kilian Q.},
  title       = {A {{New Defense Against Adversarial Images}}: {{Turning}} a {{Weakness}} into a {{Strength}}},
  shorttitle  = {A {{New Defense Against Adversarial Images}}},
  date        = {2019-12-03},
  eprinttype  = {arxiv},
  eprint      = {1910.07629},
  eprintclass = {cs, stat},
  url         = {http://arxiv.org/abs/1910.07629},
  urldate     = {2021-03-28},
  langid      = {english},
  keywords    = {Computer Science - Cryptography and Security,Computer Science - Machine Learning,Statistics - Machine Learning},
  abstract    = {Natural images are virtually surrounded by low-density misclassified regions that can be efficiently discovered by gradient-guided search — enabling the generation of adversarial images. While many techniques for detecting these attacks have been proposed, they are easily bypassed when the adversary has full knowledge of the detection mechanism and adapts the attack strategy accordingly. In this paper, we adopt a novel perspective and regard the omnipresence of adversarial perturbations as a strength rather than a weakness. We postulate that if an image has been tampered with, these adversarial directions either become harder to find with gradient methods or have substantially higher density than for natural images. We develop a practical test for this signature characteristic to successfully detect adversarial attacks, achieving unprecedented accuracy under the white-box setting where the adversary is given full knowledge of our detection mechanism.},
  file        = {/Users/ryedida/Zotero/storage/525LKQHI/Yu et al. - 2019 - A New Defense Against Adversarial Images Turning .pdf},
}

@inproceedings{Zeiler2014,
  title = {Visualizing and Understanding Convolutional Networks},
  author = {Zeiler, Matthew D. and Fergus, Rob},
  date = {2014},
  booktitle = {Computer {{Vision}} -- {{ECCV}} 2014},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {8689},
  part = {1},
  pages = {818--833},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-10590-1_53},
  abstract = {Large Convolutional Network models have recently demonstrated impressive classification performance on the ImageNet benchmark. However there is no clear understanding of why they perform so well, or how they might be improved. In this paper we address both issues. We introduce a novel visualization technique that gives insight into the function of intermediate feature layers and the operation of the classifier. We also perform an ablation study to discover the performance contribution from different model layers. This enables us to find model architectures that outperform Krizhevsky et al. on the ImageNet classification benchmark. We show our ImageNet model generalizes well to other datasets: when the softmax classifier is retrained, it convincingly beats the current state-of-the-art results on Caltech-101 and Caltech-256 datasets.},
  isbn = {9783319105895},
  file = {/Users/ryedida/Zotero/storage/DJFUTZMF/Zeiler, Fergus - 2014 - Visualizing and understanding convolutional networks(2).pdf}
}

@inproceedings{Zhang2016,
  title = {Video Summarization with Long Short-Term Memory},
  author = {Zhang, Ke and Chao, Wei-Lun and Sha, Fei and Grauman, Kristen},
  date = {2016},
  booktitle = {Computer {{Vision}} -- {{ECCV}} 2016},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {9911},
  pages = {766--782},
  issn = {1611-3349},
  doi = {10.1007/978-3-319-46478-7_47},
  abstract = {We propose a novel supervised learning technique for summarizing videos by automatically selecting keyframes or key subshots. Casting the problem as a structured prediction problem on sequential data, our main idea is to use Long Short-Term Memory (LSTM), a special type of recurrent neural networks to model the variable-range dependencies entailed in the task of video summarization. Our learning models attain the state-of-the-art results on two benchmark video datasets. Detailed analysis justifies the design of the models. In particular, we show that it is crucial to take into consideration the sequential structures in videos and model them. Besides advances in modeling techniques, we introduce techniques to address the need of a large number of annotated data for training complex learning models. There, our main idea is to exploit the existence of auxiliary annotated video datasets, albeit heterogeneous in visual styles and contents. Specifically, we show domain adaptation techniques can improve summarization by reducing the discrepancies in statistical properties across those datasets.},
  isbn = {9783319464770},
  keywords = {Long short-term memory,Video summarization},
  file = {/Users/ryedida/Zotero/storage/RK9HV4SW/Zhang et al. - 2016 - Video summarization with long short-term memory(2).pdf}
}

@inproceedings{Zhang2017,
  title = {Incorporating {{Rich Features}} into {{Deep Knowledge Tracing}}},
  author = {Zhang, Liang and Xiong, Xiaolu and Zhao, Siyuan and Botelho, Anthony and Heffernan, Neil T.},
  date = {2017},
  booktitle = {Proceedings of the Fourth (2017) ACM Conference on Learning @ Scale - L@S '17},
  doi = {10.1145/3051457.3053976},
  abstract = {The desire to follow student learning within intelligent tutoring systems in near real time has led to the development of several models anticipating the correctness of the next item as students work through an assignment. Such models have included Bayesian Knowledge Tracing (BKT), Performance Factors Analysis (PFA), and more recently with developments in deep learning, Deep Knowledge Tracing (DKT). This DKT model, based on the use of a recurrent neural network, exhibited promising results. Thus far, however, the model has only considered the knowledge components of the problems and correctness as input, neglecting the breadth of other features collected by computer-based learning platforms. This work seeks to improve upon the DKT model by incorporating more features at the problem-level. With this higher dimensional input, an adaption to the original DKT model structure is also proposed, incorporating an auto-encoder network layer to convert the input into a low dimensional feature vector to reduce both the resource requirement and time needed to train. Experiment results show that our adapted DKT model, observing more combinations of features, can effectively improve accuracy.},
  isbn = {9781450344500}
}

% Conference paper (ASE 2018): venue stored as booktitle; tool name and the GAN acronym are brace-protected.
@inproceedings{Zhang2018,
  title = {{{DeepRoad}}: {{GAN-based}} Metamorphic Testing and Input Validation Framework for Autonomous Driving Systems},
  booktitle = {ASE 2018 - Proceedings of the 33rd ACM/IEEE International Conference on Automated Software Engineering},
  author = {Zhang, Mengshi and Zhang, Yuqun and Zhang, Lingming and Liu, Cong and Khurshid, Sarfraz},
  date = {2018},
  pages = {132--142},
  doi = {10.1145/3238147.3238187},
  abstract = {While Deep Neural Networks (DNNs) have established the fundamentals of image-based autonomous driving systems, they may exhibit erroneous behaviors and cause fatal accidents. To address the safety issues in autonomous driving systems, a recent set of testing techniques have been designed to automatically generate artificial driving scenes to enrich test suite, e.g., generating new input images transformed from the original ones. However, these techniques are insufficient due to two limitations: first, many such synthetic images often lack diversity of driving scenes, and hence compromise the resulting efficacy and reliability. Second, for machine-learning-based systems, a mismatch between training and application domain can dramatically degrade system accuracy, such that it is necessary to validate inputs for improving system robustness. In this paper, we propose DeepRoad, an unsupervised DNN-based framework for automatically testing the consistency of DNN-based autonomous driving systems and online validation. First, DeepRoad automatically synthesizes large amounts of diverse driving scenes without using image transformation rules (e.g. scale, shear and rotation). In particular, DeepRoad is able to produce driving scenes with various weather conditions (including those with rather extreme conditions) by applying Generative Adversarial Networks (GANs) along with the corresponding real-world weather scenes. Second, DeepRoad utilizes metamorphic testing techniques to check the consistency of such systems using synthetic images. Third, DeepRoad validates input images for DNN-based systems by measuring the distance of the input and training images using their VGGNet features. We implement DeepRoad to test three well-recognized DNN-based autonomous driving systems in Udacity self-driving car challenge. The experimental results demonstrate that DeepRoad can detect thousands of inconsistent behaviors for these systems, and effectively validate input images to potentially enhance the system robustness as well.},
  isbn = {9781450359375},
  keywords = {Deep neural networks,Input validation,Software testing,Test generation},
  file = {/Users/ryedida/Zotero/storage/WDVB8C36/Zhang et al. - 2018 - Deeproad GaN-based metamorphic testing and input validation framework for autonomous driving systems(2).pdf}
}

% arXiv preprint on Fixup initialization for residual networks.
@unpublished{Zhang2019,
  author = {Zhang, Hongyi and Dauphin, Yann N. and Ma, Tengyu},
  title = {Fixup {{Initialization}}: {{Residual Learning Without Normalization}}},
  date = {2019},
  pages = {1--16},
  eprinttype = {arxiv},
  eprint = {1901.09321},
  url = {http://arxiv.org/abs/1901.09321},
  file = {/Users/ryedida/Zotero/storage/BZ5GGRDK/Zhang, Dauphin, Ma - 2019 - Fixup Initialization Residual Learning Without Normalization(2).pdf},
  abstract = {Normalization layers are a staple in state-of-the-art deep neural network architectures. They are widely believed to stabilize training, enable higher learning rate, accelerate convergence and improve generalization, though the reason for their effectiveness is still an active research topic. In this work, we challenge the commonly-held beliefs by showing that none of the perceived benefits is unique to normalization. Specifically, we propose fixed-update initialization (Fixup), an initialization motivated by solving the exploding and vanishing gradient problem at the beginning of training via properly rescaling a standard initialization. We find training residual networks with Fixup to be as stable as training with normalization -- even for networks with 10,000 layers. Furthermore, with proper regularization, Fixup enables residual networks without normalization to achieve state-of-the-art performance in image classification and machine translation.}
}

% Title ligature and PDF-extraction damage in the abstract repaired (the square-root signs had drifted into nearby words).
% NOTE(review): no date or venue is recorded for this entry; fill in from the publisher record.
@article{zhangAdamCanConverge,
  title = {Adam {{Can Converge Without Any Modification On Update Rules}}},
  author = {Zhang, Yushun and Chen, Congliang and Shi, Naichen and Sun, Ruoyu and Luo, Zhi-Quan},
  abstract = {Ever since Reddi et al. (2018) pointed out the divergence issue of Adam, many new variants have been designed to obtain convergence. However, vanilla Adam remains exceptionally popular and it works well in practice. Why is there a gap between theory and practice? We point out there is a mismatch between the settings of theory and practice: Reddi et al. (2018) pick the problem after picking the hyperparameters of Adam, i.e., (β1, β2); while practical applications often fix the problem first and then tune (β1, β2). Due to this observation, we conjecture that the empirical convergence can be theoretically justified, only if we change the order of picking the problem and hyperparameter. In this work, we confirm this conjecture. We prove that, when the 2nd-order momentum parameter β2 is large and 1st-order momentum parameter β1 {$<$} √β2 {$<$} 1, Adam converges to the neighborhood of critical points. The size of the neighborhood is propositional to the variance of stochastic gradients. Under an extra condition (strong growth condition), Adam converges to critical points. It is worth mentioning that our results cover a wide range of hyperparameters: as β2 increases, our convergence result can cover any β1 ∈ [0, 1) including β1 = 0.9, which is the default setting in deep learning libraries. To our knowledge, this is the first result showing that Adam can converge without any modification on its update rules. Further, our analysis does not require assumptions of bounded gradients or bounded 2nd-order momentum. When β2 is small, we further point out a large region of (β1, β2) combinations where Adam can diverge to infinity. Our divergence result considers the same setting (fixing the optimization problem ahead) as our convergence result, indicating that there is a phase transition from divergence to convergence when increasing β2. These positive and negative results provide suggestions on how to tune Adam hyperparameters: for instance, when Adam does not work well, we suggest tuning up β2 and trying β1 {$<$} √β2.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/TJFEIEH3/Zhang et al. - Adam Can Converge Without Any Modiﬁcation On Updat.pdf}
}

% ICSE 2021 paper; the letter-spaced tool name from PDF extraction is restored in the abstract.
@inproceedings{zhangAUTOTRAINERAutomaticDNN2021,
  title = {{{AUTOTRAINER}}: {{An Automatic DNN Training Problem Detection}} and {{Repair System}}},
  shorttitle = {{{AUTOTRAINER}}},
  booktitle = {2021 {{IEEE}}/{{ACM}} 43rd {{International Conference}} on {{Software Engineering}} ({{ICSE}})},
  author = {Zhang, Xiaoyu and Zhai, Juan and Ma, Shiqing and Shen, Chao},
  date = {2021-05},
  pages = {359--371},
  publisher = {{IEEE}},
  location = {{Madrid, Spain}},
  doi = {10.1109/ICSE43902.2021.00043},
  url = {https://ieeexplore.ieee.org/document/9402077/},
  urldate = {2021-05-22},
  abstract = {With machine learning models especially Deep Neural Network (DNN) models becoming an integral part of the new intelligent software, new tools to support their engineering process are in high demand. Existing DNN debugging tools are either post-training which wastes a lot of time training a buggy model and requires expertises, or limited on collecting training logs without analyzing the problem not even fixing them. In this paper, we propose AutoTrainer, a DNN training monitoring and automatic repairing tool which supports detecting and autorepairing five commonly seen training problems. During training, it periodically checks the training status and detects potential problems. Once a problem is found, AutoTrainer tries to fix it by using built-in state-of-the-art solutions. It supports various model structures and input data types, such as Convolutional Neural Networks (CNNs) for image and Recurrent Neural Networks (RNNs) for texts. Our evaluation on 6 datasets, 495 models show that AutoTrainer can effectively detect all potential problems with 100\% detection rate and no false positives. Among all models with problems, it can fix 97.33\% of them, increasing the accuracy by 47.08\% on average.},
  eventtitle = {2021 {{IEEE}}/{{ACM}} 43rd {{International Conference}} on {{Software Engineering}} ({{ICSE}})},
  isbn = {978-1-66540-296-5},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/QJ728GIS/Zhang et al. - 2021 - AUTOTRAINER An Automatic DNN Training Problem Det.pdf}
}

% IEEE TSE journal article; shortjournal abbreviation spelled correctly.
@article{zhangContextAwareNeuralFault2023,
  title = {Context-{{Aware Neural Fault Localization}}},
  author = {Zhang, Zhuo and Lei, Yan and Mao, Xiaoguang and Yan, Meng and Xia, Xin and Lo, David},
  date = {2023-07},
  journaltitle = {IEEE Transactions on Software Engineering},
  shortjournal = {IEEE Trans. Software Eng.},
  volume = {49},
  number = {7},
  pages = {3939--3954},
  issn = {0098-5589, 1939-3520, 2326-3881},
  doi = {10.1109/TSE.2023.3279125},
  url = {https://ieeexplore.ieee.org/document/10132088/},
  urldate = {2023-10-06},
  abstract = {Numerous fault localization techniques identify suspicious statements potentially responsible for program failures by discovering the statistical correlation between test results (i.e., failing or passing) and the executions of the different statements of a program (i.e., covered or not covered). They rarely incorporate a failure context into their suspiciousness evaluation despite the fact that a failure context showing how a failure is produced is useful for analyzing and locating faults. Since a failure context usually contains the transitive relationships among the statements of causing a failure, its relationship complexity becomes one major obstacle for the context incorporation in suspiciousness evaluation of fault localization. To overcome the obstacle, our insight is that leveraging the promising learning ability may be a candidate solution to learn a feasible model for incorporating a failure context into fault localization. Thus, we propose a context-aware neural fault localization approach (CAN). Specifically, CAN represents the failure context by constructing a program dependency graph, which shows how a set of statements interact with each other (i.e., data and control dependencies) to cause a failure. Then, CAN utilizes graph neural networks to analyze and incorporate the context (e.g., the dependencies among the statements) into suspiciousness evaluation. Our empirical results on the 12 large-sized programs show that CAN achieves promising results (e.g., 29.23\% faults are ranked within top 5), and it significantly improves the state-of-the-art baselines with a substantial margin.},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/R9PCVVT2/Zhang et al. - 2023 - Context-Aware Neural Fault Localization.pdf}
}

% Conference paper, not a report: typed as inproceedings and given a URL with a scheme.
% NOTE(review): venue and year (AAAI 2018) were filled in by the reviewer; confirm against the proceedings record.
@inproceedings{zhangEndtoEndDeepLearning,
  title = {An {{End-to-End Deep Learning Architecture}} for {{Graph Classification}}},
  booktitle = {Proceedings of the {{AAAI Conference}} on {{Artificial Intelligence}}},
  author = {Zhang, Muhan and Cui, Zhicheng and Neumann, Marion and Chen, Yixin},
  date = {2018},
  url = {https://www.aaai.org},
  urldate = {2021-03-13},
  abstract = {Neural networks are typically designed to deal with data in tensor forms. In this paper, we propose a novel neural network architecture accepting graphs of arbitrary structure. Given a dataset containing graphs in the form of (G, y) where G is a graph and y is its class, we aim to develop neural networks that read the graphs directly and learn a classification function. There are two main challenges: 1) how to extract useful features characterizing the rich information encoded in a graph for classification purpose, and 2) how to sequentially read a graph in a meaningful and consistent order. To address the first challenge, we design a localized graph convolution model and show its connection with two graph kernels. To address the second challenge, we design a novel SortPooling layer which sorts graph vertices in a consistent order so that traditional neural networks can be trained on the graphs. Experiments on benchmark graph classification datasets demonstrate that the proposed architecture achieves highly competitive performance with state-of-the-art graph kernels and other graph neural network methods. Moreover, the architecture allows end-to-end gradient-based training with original graphs, without the need to first transform graphs into vectors.}
}

% CVPR 2017 paper; a lost hyphen in the abstract (end-to-end) is restored.
@inproceedings{zhangLearningDeepEmbedding2017,
  title = {Learning a {{Deep Embedding Model}} for {{Zero-Shot Learning}}},
  booktitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  author = {Zhang, Li and Xiang, Tao and Gong, Shaogang},
  date = {2017-07},
  pages = {3010--3019},
  publisher = {{IEEE}},
  location = {{Honolulu, HI}},
  doi = {10.1109/CVPR.2017.321},
  url = {http://ieeexplore.ieee.org/document/8099804/},
  urldate = {2021-04-01},
  abstract = {Zero-shot learning (ZSL) models rely on learning a joint embedding space where both textual/semantic description of object classes and visual representation of object images can be projected to for nearest neighbour search. Despite the success of deep neural networks that learn an end-to-end model between text and images in other vision problems such as image captioning, very few deep ZSL model exists and they show little advantage over ZSL models that utilise deep feature representations but do not learn an end-to-end embedding. In this paper we argue that the key to make deep ZSL models succeed is to choose the right embedding space. Instead of embedding into a semantic space or an intermediate space, we propose to use the visual space as the embedding space. This is because that in this space, the subsequent nearest neighbour search would suffer much less from the hubness problem and thus become more effective. This model design also provides a natural mechanism for multiple semantic modalities (e.g., attributes and sentence descriptions) to be fused and optimised jointly in an end-to-end manner. Extensive experiments on four benchmarks show that our model significantly outperforms the existing models.},
  eventtitle = {2017 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
  isbn = {978-1-5386-0457-1},
  langid = {english},
  annotation = {347 citations (Semantic Scholar/DOI) [2021-04-01]},
  file = {/Users/ryedida/Zotero/storage/6MAN39DC/Zhang et al. - 2017 - Learning a Deep Embedding Model for Zero-Shot Lear.pdf}
}

% arXiv preprint. The method name is lowercase (as in the abstract), so it is brace-protected;
% eprintclass holds a single arXiv category (cs.LG, matching the first keyword).
@unpublished{zhangMixupEmpiricalRisk2018,
  title = {{{mixup}}: {{Beyond Empirical Risk Minimization}}},
  shorttitle = {{{mixup}}},
  author = {Zhang, Hongyi and Cisse, Moustapha and Dauphin, Yann N. and Lopez-Paz, David},
  date = {2018-04-27},
  eprint = {1710.09412},
  eprinttype = {arxiv},
  eprintclass = {cs.LG},
  url = {http://arxiv.org/abs/1710.09412},
  urldate = {2021-04-02},
  abstract = {Large deep neural networks are powerful, but exhibit undesirable behaviors such as memorization and sensitivity to adversarial examples. In this work, we propose mixup, a simple learning principle to alleviate these issues. In essence, mixup trains a neural network on convex combinations of pairs of examples and their labels. By doing so, mixup regularizes the neural network to favor simple linear behavior in-between training examples. Our experiments on the ImageNet-2012, CIFAR-10, CIFAR-100, Google commands and UCI datasets show that mixup improves the generalization of state-of-the-art neural network architectures. We also find that mixup reduces the memorization of corrupt labels, increases the robustness to adversarial examples, and stabilizes the training of generative adversarial networks.},
  langid = {english},
  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation = {1485 citations (Semantic Scholar/arXiv) [2021-04-02]},
  file = {/Users/ryedida/Zotero/storage/PPMLCH5E/Zhang et al. - 2018 - mixup Beyond Empirical Risk Minimization.pdf}
}

% arXiv preprint; eprintclass holds the full arXiv category (math.OC, matching the keyword) rather than the bare archive name.
@online{zhangNewInsightsSmoothness2021,
  title = {New Insights in Smoothness and Strong Convexity with Improved Convergence of Gradient Descent},
  author = {Zhang, Lu and Wang, Jiani and Zhang, Hui},
  date = {2021-10-28},
  eprint = {2110.15470},
  eprinttype = {arxiv},
  eprintclass = {math.OC},
  url = {http://arxiv.org/abs/2110.15470},
  urldate = {2023-12-09},
  abstract = {The starting assumptions to study the convergence and complexity of gradient-type methods may be the smoothness (also called Lipschitz continuity of gradient) and the strong convexity. In this note, we revisit these two basic properties from a new perspective that motivates their definitions and equivalent characterizations, along with an improved linear convergence of the gradient descent method.},
  langid = {english},
  pubstate = {preprint},
  keywords = {Mathematics - Optimization and Control},
  file = {/Users/ryedida/Zotero/storage/C647IHDY/Zhang et al. - 2021 - New insights in smoothness and strong convexity wi.pdf}
}

% IET Software article. Placeholder "n/a" volume/number values are dropped so styles do not print them.
% NOTE(review): date, volume and issue are not recorded here; fill in from the DOI record.
@article{zhangSoftwareDefectPrediction,
  title = {Software Defect Prediction Based on Stacked Sparse Denoising Autoencoders and Enhanced Extreme Learning Machine},
  author = {Zhang, Nana and Ying, Shi and Zhu, Kun and Zhu, Dandan},
  journaltitle = {IET Software},
  issn = {1751-8814},
  doi = {10.1049/sfw2.12029},
  url = {https://onlinelibrary.wiley.com/doi/abs/10.1049/sfw2.12029},
  urldate = {2021-09-29},
  abstract = {Software defect prediction is an important software quality assurance technique. Nevertheless, the prediction performance of the constructed model is easily susceptible to irrelevant or redundant features in the software projects and is not predominant enough. To address these two issues, a novel defect prediction model called SSEPG based on Stacked Sparse Denoising AutoEncoders (SSDAE) and Extreme Learning Machine (ELM) optimised by Particle Swarm Optimisation (PSO) and another complementary Gravitational Search Algorithm (GSA) are proposed in this paper, which has two main merits: (1) employ a novel deep neural network – SSDAE to extract new combined features, which can effectively learn the robust deep semantic feature representation. (2) integrate strong exploitation capacity of PSO with strong exploration capability of GSA to optimise the input weights and hidden layer biases of ELM, and utilise the superior discriminability of the enhanced ELM to predict the defective modules. The SSDAE is compared with eleven state-of-the-art feature extraction methods in effect and efficiency, and the SSEPG model is compared with multiple baseline models that contain five classic defect predictors and three variants across 24 software defect projects. The experimental results exhibit the superiority of the SSDAE and the SSEPG on six evaluation metrics.},
  langid = {english},
  annotation = {0 citations (Semantic Scholar/DOI) [2021-09-28]},
  file = {/Users/ryedida/Zotero/storage/UQBCUEYW/Zhang et al. - Software defect prediction based on stacked sparse.pdf;/Users/ryedida/Zotero/storage/5CXS29TI/sfw2.html}
}

% arXiv preprint (TRADES). The surname "El Ghaoui" is kept whole in the family-name part so it sorts and abbreviates correctly;
% the norm symbol lost in extraction is restored in the abstract; eprintclass holds a single arXiv category.
@unpublished{zhangTheoreticallyPrincipledTradeoff2019,
  title = {Theoretically {{Principled Trade-off}} between {{Robustness}} and {{Accuracy}}},
  author = {Zhang, Hongyang and Yu, Yaodong and Jiao, Jiantao and Xing, Eric P. and El Ghaoui, Laurent and Jordan, Michael I.},
  date = {2019-06-24},
  eprint = {1901.08573},
  eprinttype = {arxiv},
  eprintclass = {cs.LG},
  url = {http://arxiv.org/abs/1901.08573},
  urldate = {2021-04-21},
  abstract = {We identify a trade-off between robustness and accuracy that serves as a guiding principle in the design of defenses against adversarial examples. Although this problem has been widely studied empirically, much remains unknown concerning the theory underlying this trade-off. In this work, we decompose the prediction error for adversarial examples (robust error) as the sum of the natural (classification) error and boundary error, and provide a differentiable upper bound using the theory of classification-calibrated loss, which is shown to be the tightest possible upper bound uniform over all probability distributions and measurable predictors. Inspired by our theoretical analysis, we also design a new defense method, TRADES, to trade adversarial robustness off against accuracy. Our proposed algorithm performs well experimentally in real-world datasets. The methodology is the foundation of our entry to the NeurIPS 2018 Adversarial Vision Challenge in which we won the 1st place out of \textasciitilde 2,000 submissions, surpassing the runner-up approach by 11.41\% in terms of mean {$\ell_2$} perturbation distance.},
  langid = {english},
  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
  annotation = {504 citations (Semantic Scholar/arXiv) [2021-04-21]},
  file = {/Users/ryedida/Zotero/storage/Z9UJF3H5/Zhang et al. - 2019 - Theoretically Principled Trade-off between Robustn.pdf}
}

% arXiv preprint; eprintclass holds the full arXiv category (cs.LG, matching the keyword) rather than the bare archive name.
@unpublished{zhangUnderstandingDeepLearning2017,
  title = {Understanding Deep Learning Requires Rethinking Generalization},
  author = {Zhang, Chiyuan and Bengio, Samy and Hardt, Moritz and Recht, Benjamin and Vinyals, Oriol},
  date = {2017-02-26},
  eprint = {1611.03530},
  eprinttype = {arxiv},
  eprintclass = {cs.LG},
  url = {http://arxiv.org/abs/1611.03530},
  urldate = {2021-04-01},
  abstract = {Despite their massive size, successful deep artificial neural networks can exhibit a remarkably small difference between training and test performance. Conventional wisdom attributes small generalization error either to properties of the model family, or to the regularization techniques used during training.},
  langid = {english},
  keywords = {Computer Science - Machine Learning},
  annotation = {2481 citations (Semantic Scholar/arXiv) [2021-04-01]},
  file = {/Users/ryedida/Zotero/storage/QUDJT8IM/Zhang et al. - 2017 - Understanding deep learning requires rethinking ge.pdf}
}

% Communications of the ACM journal article on generalization in deep learning.
@article{zhangUnderstandingDeepLearning2021,
  author = {Zhang, Chiyuan and Bengio, Samy and Hardt, Moritz and Recht, Benjamin and Vinyals, Oriol},
  title = {Understanding Deep Learning (Still) Requires Rethinking Generalization},
  journaltitle = {Communications of the ACM},
  shortjournal = {Commun. ACM},
  date = {2021-03},
  volume = {64},
  number = {3},
  pages = {107--115},
  doi = {10.1145/3446776},
  issn = {0001-0782, 1557-7317},
  url = {https://dl.acm.org/doi/10.1145/3446776},
  urldate = {2023-11-26},
  langid = {english},
  file = {/Users/ryedida/Zotero/storage/A6XVHIEA/Zhang et al. - 2021 - Understanding deep learning (still) requires rethi.pdf},
  abstract = {Despite their massive size, successful deep artificial neural networks can exhibit a remarkably small gap between training and test performance. Conventional wisdom attributes small generalization error either to properties of the model family or to the regularization techniques used during training.}
}

% arXiv preprint; a lost hyphen in the abstract (high-dimensional) is restored and eprintclass holds the full arXiv category.
@unpublished{zhangVariancePrincipleExplains2021,
  title = {A Variance Principle Explains Why Dropout Finds Flatter Minima},
  author = {Zhang, Zhongwang and Zhou, Hanxu and Xu, Zhi-Qin John},
  date = {2021-11-01},
  eprint = {2111.01022},
  eprinttype = {arxiv},
  eprintclass = {cs.LG},
  url = {http://arxiv.org/abs/2111.01022},
  urldate = {2022-01-07},
  abstract = {Although dropout has achieved great success in deep learning, little is known about how it helps the training find a good generalization solution in the high-dimensional parameter space. In this work, we show that the training with dropout finds the neural network with a flatter minimum compared with standard gradient descent training. We further study the underlying mechanism of why dropout finds flatter minima through experiments. We propose a Variance Principle that the variance of a noise is larger at the sharper direction of the loss landscape. Existing works show that SGD satisfies the variance principle, which leads the training to flatter minima. Our work show that the noise induced by the dropout also satisfies the variance principle that explains why dropout finds flatter minima. In general, our work points out that the variance principle is an important similarity between dropout and SGD that lead the training to find flatter minima and obtain good generalization.},
  langid = {english},
  keywords = {Computer Science - Machine Learning},
  annotation = {1 citations (Semantic Scholar/arXiv) [2022-01-07]},
  file = {/Users/ryedida/Zotero/storage/GSETM595/Zhang et al. - 2021 - A variance principle explains why dropout finds fl.pdf}
}

% ESEC/FSE 2018 paper; the camel-case tool name is brace-protected so sentence-casing styles keep it intact.
@inproceedings{zhao2018deepsim,
  title = {{{DeepSim}}: Deep Learning Code Functional Similarity},
  booktitle = {Proceedings of the 2018 26th {{ACM Joint Meeting}} on {{European Software Engineering Conference}} and {{Symposium}} on the {{Foundations}} of {{Software Engineering}}},
  author = {Zhao, Gang and Huang, Jeff},
  date = {2018},
  pages = {141--151},
  publisher = {{ACM}}
}

% IJCAI 2020 conference paper (per the DOI prefix): typed as inproceedings with the proceedings title supplied.
@inproceedings{Zhou2020,
  title = {{{pbSGD}}: {{Powered Stochastic Gradient Descent Methods}} for {{Accelerated Non-Convex Optimization}}},
  booktitle = {Proceedings of the {{Twenty-Ninth International Joint Conference}} on {{Artificial Intelligence}}},
  author = {Zhou, Beitong and Liu, Jun and Sun, Weigao and Chen, Ruijuan and Tomlin, Claire and Yuan, Ye},
  date = {2020},
  pages = {3258--3266},
  doi = {10.24963/ijcai.2020/451},
  abstract = {We propose a novel technique for improving the stochastic gradient descent (SGD) method to train deep networks, which we term pbSGD. The proposed pbSGD method simply raises the stochastic gradient to a certain power elementwise during iterations and introduces only one additional parameter, namely, the power exponent (when it equals to 1, pbSGD reduces to SGD). We further propose pbSGD with momentum, which we term pbSGDM. The main results of this paper present comprehensive experiments on popular deep learning models and benchmark datasets. Empirical results show that the proposed pbSGD and pbSGDM obtain faster initial training speed than adaptive gradient methods, comparable generalization ability with SGD, and improved robustness to hyper-parameter selection and vanishing gradients. pbSGD is essentially a gradient modifier via a nonlinear transformation. As such, it is orthogonal and complementary to other techniques for accelerating gradient-based optimization such as learning rate schedules. Finally, we show convergence rate analysis for both pbSGD and pbSGDM methods. The theoretical rates of convergence match the best known theoretical rates of convergence for SGD and SGDM methods on nonconvex functions.},
  keywords = {Machine Learning: Deep Learning}
}

% Empirical Software Engineering journal article on bug-report classification (BugClass).
@article{zhouWhyWhatHappened2021,
  author = {Zhou, Cheng and Li, Bin and Sun, Xiaobing and Bo, Lili},
  title = {Why and What Happened? {{Aiding}} Bug Comprehension with Automated Category and Causal Link Identification},
  shorttitle = {Why and What Happened?},
  journaltitle = {Empirical Software Engineering},
  shortjournal = {Empir Software Eng},
  date = {2021-11},
  volume = {26},
  number = {6},
  pages = {118},
  doi = {10.1007/s10664-021-10010-8},
  issn = {1382-3256, 1573-7616},
  url = {https://link.springer.com/10.1007/s10664-021-10010-8},
  urldate = {2022-01-07},
  langid = {english},
  annotation = {1 citations (Semantic Scholar/DOI) [2022-01-07]},
  file = {/Users/ryedida/Zotero/storage/T3WQR5PU/Zhou et al. - 2021 - Why and what happened Aiding bug comprehension wi.pdf},
  abstract = {When a new bug report is assigned to developers, they first need to understand what the bug report expresses (what) and why this bug occurs (why). To do so, developers usually explore different bug related data sources to investigate whether there are historical bugs with similar symptoms and causes related to the bug at hand. Automatic bug classification with respect to what and why information of bugs would enable developers to narrow down their search of bug resources and improve the bug fixing productivity. To achieve this goal, we propose an approach, BugClass, which applies a deep neural network classification approach based on Hierarchical Attention Networks (HAN) to automatically classify the bugs into different what and why categories by exploiting the bug repository and commit repository. Then, we explore the causal link relationship between what and why categories to further improve the accuracy of the bug classification. Experimental results demonstrate that BugClass is effective to classify the given bug reports into what and why categories, and can be also effectively used for identifying the why category for new bugs based on the causal link relations.}
}

% AAAI 2015 conference paper: venue stored as booktitle with its spelling fixed.
% The ACM landing-page link lives in url because it is not a DOI.
@inproceedings{Zhu2015,
  title = {Machine {{Teaching}}: {{An Inverse Problem}} to {{Machine Learning}} and an {{Approach Toward Optimal Education}}},
  booktitle = {Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence},
  author = {Zhu, Xiaojin},
  date = {2015},
  pages = {4083--4087},
  url = {http://dl.acm.org/citation.cfm?id=2888288},
  abstract = {I draw the reader's attention to machine teaching, the problem of finding an optimal training set given a machine learning algorithm and a target model. In addition to generating fascinating mathematical questions for computer scientists to ponder, machine teaching holds the promise of enhancing education and personnel training. The Socratic dialogue style aims to stimulate critical thinking.},
  isbn = {9781577357049},
  file = {/Users/ryedida/Zotero/storage/A25RJ87N/Zhu - 2015 - Machine Teaching An Inverse Problem to Machine Learning and an Approach Toward Optimal Education(2).pdf}
}

% Journal of Systems and Software article on defect prediction (EMWS feature selection, WSHCKE predictor).
@article{zhuSoftwareDefectPrediction2021,
  author = {Zhu, Kun and Ying, Shi and Zhang, Nana and Zhu, Dandan},
  title = {Software Defect Prediction Based on Enhanced Metaheuristic Feature Selection Optimization and a Hybrid Deep Neural Network},
  journaltitle = {Journal of Systems and Software},
  shortjournal = {Journal of Systems and Software},
  date = {2021-10-01},
  volume = {180},
  pages = {111026},
  doi = {10.1016/j.jss.2021.111026},
  issn = {0164-1212},
  url = {https://www.sciencedirect.com/science/article/pii/S0164121221001230},
  urldate = {2021-10-23},
  langid = {english},
  keywords = {Convolutional neural network,Kernel extreme learning machine,Metaheuristic feature selection,Software defect prediction,Whale optimization algorithm},
  annotation = {0 citations (Semantic Scholar/DOI) [2021-10-23]},
  file = {/Users/ryedida/Zotero/storage/EHQV4MLU/Zhu et al. - 2021 - Software defect prediction based on enhanced metah.pdf},
  abstract = {Software defect prediction aims to identify the potential defects of new software modules in advance by constructing an effective prediction model. However, the model performance is susceptible to irrelevant and redundant features. In addition, previous studies mainly use traditional data mining or machine learning techniques for defect prediction, the prediction performance is not superior enough. For the first issue, motivated by the idea of search based software engineering, we leverage the recently proposed whale optimization algorithm (WOA) and another complementary simulated annealing (SA) to construct an enhanced metaheuristic search based feature selection algorithm named EMWS, which can effectively select fewer but closely related representative features. For the second issue, we employ a hybrid deep neural network — convolutional neural network (CNN) and kernel extreme learning machine (KELM) to construct a unified defect prediction predictor called WSHCKE, which can further integrate the selected features into the abstract deep semantic features by CNN and boost the prediction performance by taking full advantage of the strong classification capacity of KELM. We conduct extensive experiments for feature selection or extraction and defect prediction across 20 widely-studied software projects on four evaluation indicators. Experimental results demonstrate the superiority of EMWS and WSHCKE.}
}

% Conference paper. The middle initial of the last author carries its period
% so that styles printing full given names do not render a bare letter.
@inproceedings{zoph2018learning,
  author = {Zoph, Barret and Vasudevan, Vijay and Shlens, Jonathon and Le, Quoc V.},
  title = {Learning Transferable Architectures for Scalable Image Recognition},
  booktitle = {Proceedings of the {{IEEE}} Conference on Computer Vision and Pattern Recognition},
  date = {2018},
  pages = {8697--8710}
}

% NOTE(review): this is the same arXiv preprint (1811.08888) as the entry keyed
% zouStochasticGradientDescent2018 in this file; consider merging the two records.
% The acronym ReLU is brace-protected so sentence-casing styles cannot lowercase it.
@unpublished{zou2018stochastic,
  title = {Stochastic Gradient Descent Optimizes Over-Parameterized Deep {{ReLU}} Networks},
  author = {Zou, Difan and Cao, Yuan and Zhou, Dongruo and Gu, Quanquan},
  date = {2018},
  eprint = {1811.08888},
  eprinttype = {arxiv}
}

% arXiv preprint record. Bibliographic fields come first, then identifiers,
% then the bulky reader-side metadata (abstract, keywords, local file path).
@online{zouStochasticGradientDescent2018,
  author      = {Zou, Difan and Cao, Yuan and Zhou, Dongruo and Gu, Quanquan},
  title       = {Stochastic {{Gradient Descent Optimizes Over-parameterized Deep ReLU Networks}}},
  date        = {2018-12-27},
  pubstate    = {preprint},
  langid      = {english},
  eprint      = {1811.08888},
  eprinttype  = {arxiv},
  eprintclass = {cs, math, stat},
  url         = {http://arxiv.org/abs/1811.08888},
  urldate     = {2023-08-23},
  keywords    = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Mathematics - Optimization and Control,Statistics - Machine Learning},
  abstract    = {We study the problem of training deep neural networks with Rectified Linear Unit (ReLU) activation function using gradient descent and stochastic gradient descent. In particular, we study the binary classification problem and show that for a broad family of loss functions, with proper random weight initialization, both gradient descent and stochastic gradient descent can find the global minima of the training loss for an over-parameterized deep ReLU network, under mild assumption on the training data. The key idea of our proof is that Gaussian random initialization followed by (stochastic) gradient descent produces a sequence of iterates that stay inside a small perturbation region centering around the initial weights, in which the empirical loss function of deep ReLU networks enjoys nice local curvature properties that ensure the global convergence of (stochastic) gradient descent. Our theoretical results shed light on understanding the optimization for deep learning, and pave the way for studying the optimization dynamics of training modern deep neural networks.},
  file        = {/Users/ryedida/Zotero/storage/9NHD8TYG/Zou et al. - 2018 - Stochastic Gradient Descent Optimizes Over-paramet.pdf},
}

% arXiv manuscript record. Bibliographic fields come first, then identifiers,
% then the bulky reader-side metadata (annotation, keywords, abstract, file path).
% Author names are stored as UTF-8, so a Unicode-aware backend is expected.
@unpublished{zugnerLanguageAgnosticRepresentationLearning2021,
  author      = {Zügner, Daniel and Kirschstein, Tobias and Catasta, Michele and Leskovec, Jure and Günnemann, Stephan},
  title       = {Language-{{Agnostic Representation Learning}} of {{Source Code}} from {{Structure}} and {{Context}}},
  date        = {2021-03-21},
  langid      = {english},
  eprint      = {2103.11318},
  eprinttype  = {arxiv},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2103.11318},
  urldate     = {2021-04-16},
  annotation  = {1 citations (Semantic Scholar/arXiv) [2021-04-16]},
  keywords    = {Computer Science - Machine Learning,Computer Science - Software Engineering},
  abstract    = {Source code (Context) and its parsed abstract syntax tree (AST; Structure) are two complementary representations of the same computer program. Traditionally, designers of machine learning models have relied predominantly either on Structure or Context. We propose a new model, which jointly learns on Context and Structure of source code. In contrast to previous approaches, our model uses only language-agnostic features, i.e., source code and features that can be computed directly from the AST. Besides obtaining state-of-the-art on monolingual code summarization on all five programming languages considered in this work, we propose the first multilingual code summarization model. We show that jointly training on non-parallel data from multiple programming languages improves results on all individual languages, where the strongest gains are on low-resource languages. Remarkably, multilingual training only from Context does not lead to the same improvements, highlighting the benefits of combining Structure and Context for representation learning on code.},
  file        = {/Users/ryedida/Zotero/storage/KA24TIHJ/Zügner et al. - 2021 - Language-Agnostic Representation Learning of Sourc.pdf},
}
ecosyste.ms

Data

Tools

Indexes

Applications

Experiments

Open Source Science