<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD with MathML3 v1.2 20190208//EN"  "JATS-archivearticle1-mathml3.dtd"><article article-type="research-article" dtd-version="1.2" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink"><front><journal-meta><journal-id journal-id-type="nlm-ta">elife</journal-id><journal-id journal-id-type="publisher-id">eLife</journal-id><journal-title-group><journal-title>eLife</journal-title></journal-title-group><issn pub-type="epub" publication-format="electronic">2050-084X</issn><publisher><publisher-name>eLife Sciences Publications, Ltd</publisher-name></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">63853</article-id><article-id pub-id-type="doi">10.7554/eLife.63853</article-id><article-categories><subj-group subj-group-type="display-channel"><subject>Tools and Resources</subject></subj-group><subj-group subj-group-type="heading"><subject>Neuroscience</subject></subj-group></article-categories><title-group><article-title>Automated annotation of birdsong with a neural network that segments spectrograms</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes" id="author-210530"><name><surname>Cohen</surname><given-names>Yarden</given-names></name><contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-8149-6954</contrib-id><email>yarden.j.cohen@weizmann.ac.il</email><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1"></xref><xref ref-type="fn" rid="con1"/><xref ref-type="fn" rid="conf1"/></contrib><contrib contrib-type="author" equal-contrib="yes" id="author-211818"><name><surname>Nicholson</surname><given-names>David Aaron</given-names></name><contrib-id authenticated="true" 
contrib-id-type="orcid">https://orcid.org/0000-0002-4261-4719</contrib-id><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1"></xref><xref ref-type="fn" rid="con2"/><xref ref-type="fn" rid="conf2"/></contrib><contrib contrib-type="author" equal-contrib="yes" id="author-211819"><name><surname>Sanchioni</surname><given-names>Alexa</given-names></name><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib2"></xref><xref ref-type="other" rid="fund1"/><xref ref-type="other" rid="fund2"/><xref ref-type="fn" rid="con3"/><xref ref-type="fn" rid="conf1"/></contrib><contrib contrib-type="author" equal-contrib="yes" id="author-211820"><name><surname>Mallaber</surname><given-names>Emily K</given-names></name><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib2"></xref><xref ref-type="other" rid="fund1"/><xref ref-type="other" rid="fund2"/><xref ref-type="fn" rid="con4"/><xref ref-type="fn" rid="conf2"/></contrib><contrib contrib-type="author" equal-contrib="yes" id="author-211821"><name><surname>Skidanova</surname><given-names>Viktoriya</given-names></name><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib2"></xref><xref ref-type="other" rid="fund1"/><xref ref-type="other" rid="fund2"/><xref ref-type="fn" rid="con5"/><xref ref-type="fn" rid="conf2"/></contrib><contrib contrib-type="author" corresp="yes" id="author-211822"><name><surname>Gardner</surname><given-names>Timothy J</given-names></name><contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-1744-3970</contrib-id><email>timg@uoregon.edu</email><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="other" rid="fund1"/><xref ref-type="other" rid="fund2"/><xref ref-type="other" rid="fund3"/><xref ref-type="fn" rid="con6"/><xref ref-type="fn" rid="conf2"/></contrib><aff id="aff1"><label>1</label><institution-wrap><institution-id 
institution-id-type="ror">https://ror.org/0316ej306</institution-id><institution>Department of Brain Sciences, Weizmann Institute of Science</institution></institution-wrap><addr-line><named-content content-type="city">Rehovot</named-content></addr-line><country>Israel</country></aff><aff id="aff2"><label>2</label><institution-wrap><institution-id institution-id-type="ror">https://ror.org/03czfpz43</institution-id><institution>Biology department, Emory University</institution></institution-wrap><addr-line><named-content content-type="city">Atlanta</named-content></addr-line><country>United States</country></aff><aff id="aff3"><label>3</label><institution-wrap><institution-id institution-id-type="ror">https://ror.org/05qwgg493</institution-id><institution>Biology department, Boston University</institution></institution-wrap><addr-line><named-content content-type="city">Boston</named-content></addr-line><country>United States</country></aff><aff id="aff4"><label>4</label><institution-wrap><institution-id institution-id-type="ror">https://ror.org/0293rh119</institution-id><institution>Phil and Penny Knight Campus for Accelerating Scientific Impact, University of Oregon</institution></institution-wrap><addr-line><named-content content-type="city">Eugene</named-content></addr-line><country>United States</country></aff></contrib-group><contrib-group content-type="section"><contrib contrib-type="editor"><name><surname>Goldberg</surname><given-names>Jesse H</given-names></name><role>Reviewing Editor</role><aff><institution-wrap><institution-id institution-id-type="ror">https://ror.org/05bnh6r87</institution-id><institution>Cornell University</institution></institution-wrap><country>United States</country></aff></contrib><contrib contrib-type="senior_editor"><name><surname>Calabrese</surname><given-names>Ronald L</given-names></name><role>Senior Editor</role><aff><institution-wrap><institution-id 
institution-id-type="ror">https://ror.org/03czfpz43</institution-id><institution>Emory University</institution></institution-wrap><country>United States</country></aff></contrib></contrib-group><author-notes><fn fn-type="con" id="equal-contrib1"><label></label><p>These authors contributed equally to this work</p></fn><fn fn-type="con" id="equal-contrib2"><label></label><p>These authors also contributed equally to this work</p></fn></author-notes><pub-date date-type="publication" publication-format="electronic"><day>20</day><month>01</month><year>2022</year></pub-date><pub-date pub-type="collection"><year>2022</year></pub-date><volume>11</volume><elocation-id>e63853</elocation-id><history><date date-type="received" iso-8601-date="2020-10-09"><day>09</day><month>10</month><year>2020</year></date><date date-type="accepted" iso-8601-date="2022-01-19"><day>19</day><month>01</month><year>2022</year></date></history><pub-history><event><event-desc>This manuscript was published as a preprint at bioRxiv.</event-desc><date date-type="preprint" iso-8601-date="2020-08-28"><day>28</day><month>08</month><year>2020</year></date><self-uri content-type="preprint" xlink:href="https://doi.org/10.1101/2020.08.28.272088"/></event></pub-history><permissions><copyright-statement>© 2022, Cohen et al</copyright-statement><copyright-year>2022</copyright-year><copyright-holder>Cohen et al</copyright-holder><ali:free_to_read/><license xlink:href="http://creativecommons.org/licenses/by/4.0/"><ali:license_ref>http://creativecommons.org/licenses/by/4.0/</ali:license_ref><license-p>This article is distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License</ext-link>, which permits unrestricted use and redistribution provided that the original author and source are credited.</license-p></license></permissions><self-uri content-type="pdf" xlink:href="elife-63853-v2.pdf"/><self-uri content-type="figures-pdf" 
xlink:href="elife-63853-figures-v2.pdf"/><abstract><p>Songbirds provide a powerful model system for studying sensory-motor learning. However, many analyses of birdsong require time-consuming, manual annotation of its elements, called syllables. Automated methods for annotation have been proposed, but these methods assume that audio can be cleanly segmented into syllables, or they require carefully tuning multiple statistical models. Here, we present TweetyNet: a single neural network model that learns how to segment spectrograms of birdsong into annotated syllables. We show that TweetyNet mitigates limitations of methods that rely on segmented audio. We also show that TweetyNet performs well across multiple individuals from two species of songbirds, Bengalese finches and canaries. Lastly, we demonstrate that using TweetyNet we can accurately annotate very large datasets containing multiple days of song, and that these predicted annotations replicate key findings from behavioral studies. In addition, we provide open-source software to assist other researchers, and a large dataset of annotated canary song that can serve as a benchmark. 
We conclude that TweetyNet makes it possible to address a wide range of new questions about birdsong.</p></abstract><kwd-group kwd-group-type="author-keywords"><kwd>songbirds</kwd><kwd>machine learning algorithms</kwd><kwd>automated annotation</kwd><kwd>canaries</kwd><kwd>bengalese finches</kwd><kwd>song syntax</kwd><kwd>neural network</kwd><kwd>sound event detection</kwd></kwd-group><kwd-group kwd-group-type="research-organism"><title>Research organism</title><kwd>Other</kwd></kwd-group><funding-group><award-group id="fund1"><funding-source><institution-wrap><institution-id institution-id-type="FundRef">http://dx.doi.org/10.13039/100000065</institution-id><institution>National Institute of Neurological Disorders and Stroke</institution></institution-wrap></funding-source><award-id>R01NS104925</award-id><principal-award-recipient><name><surname>Sanchioni</surname><given-names>Alexa</given-names></name><name><surname>Mallaber</surname><given-names>Emily K</given-names></name><name><surname>Skidanova</surname><given-names>Viktoriya</given-names></name><name><surname>Gardner</surname><given-names>Timothy J</given-names></name></principal-award-recipient></award-group><award-group id="fund2"><funding-source><institution-wrap><institution-id institution-id-type="FundRef">http://dx.doi.org/10.13039/100000065</institution-id><institution>National Institute of Neurological Disorders and Stroke</institution></institution-wrap></funding-source><award-id>R24NS098536</award-id><principal-award-recipient><name><surname>Sanchioni</surname><given-names>Alexa</given-names></name><name><surname>Mallaber</surname><given-names>Emily K</given-names></name><name><surname>Skidanova</surname><given-names>Viktoriya</given-names></name><name><surname>Gardner</surname><given-names>Timothy J</given-names></name></principal-award-recipient></award-group><award-group id="fund3"><funding-source><institution-wrap><institution-id 
institution-id-type="FundRef">http://dx.doi.org/10.13039/100000065</institution-id><institution>National Institute of Neurological Disorders and Stroke</institution></institution-wrap></funding-source><award-id>R01NS118424</award-id><principal-award-recipient><name><surname>Gardner</surname><given-names>Timothy J</given-names></name></principal-award-recipient></award-group><funding-statement>The funders had no role in study design, data collection and interpretation, or the decision to submit the work for publication.</funding-statement></funding-group><custom-meta-group><custom-meta specific-use="meta-only"><meta-name>Author impact statement</meta-name><meta-value>A machine learning model, TweetyNet, makes it possible to scale up annotation of birdsong across individual birds, days of song, and species.</meta-value></custom-meta></custom-meta-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Songbirds are an excellent model system for investigating sensory-motor learning and production of sequential behavior. Birdsong is a culturally transmitted behavior learned by imitation (<xref ref-type="bibr" rid="bib56">Mooney, 2009</xref>). Juveniles typically learn song from a tutor, like babies learning to talk. Their songs consist of vocal gestures executed in sequence (<xref ref-type="bibr" rid="bib23">Fee and Scharff, 2010</xref>). In this and many other ways, birdsong resembles speech (<xref ref-type="bibr" rid="bib12">Brainard and Doupe, 2002</xref>). A key advantage of songbirds as a model system is that birds sing spontaneously, producing hundreds of song bouts a day. Their natural behavior yields a detailed readout of how learned vocalizations are acquired during development and maintained in adulthood. Leveraging this amount of data requires methods for high-throughput automated analyses. 
For example, automated methods for measuring similarity of juvenile and tutor song across development (<xref ref-type="bibr" rid="bib88">Tchernichovski et al., 2000</xref>; <xref ref-type="bibr" rid="bib53">Mets and Brainard, 2018a</xref>) led to important advances in understanding the behavioral and genetic bases of how vocalizations are learned (<xref ref-type="bibr" rid="bib89">Tchernichovski et al., 2001</xref>; <xref ref-type="bibr" rid="bib54">Mets and Brainard, 2018b</xref>; <xref ref-type="bibr" rid="bib55">Mets and Brainard, 2019</xref>). However, similarly scaling up other analyses of vocal behavior is currently hindered by a lack of automated methods.</p><p>A major roadblock to scaling up many analyses is that they require researchers to annotate song. Annotation is a time-consuming process done by hand with graphical user interface (GUI) applications, for example Praat, Audacity, Chipper (<xref ref-type="bibr" rid="bib11">Boersma and Weenink, 2021</xref>; <xref ref-type="bibr" rid="bib8">Audacity Team, 2019</xref>; <xref ref-type="bibr" rid="bib80">Searfoss et al., 2020</xref>). To annotate birdsong, researchers follow a two-step process (<xref ref-type="bibr" rid="bib90">Thompson et al., 2012</xref>; <xref ref-type="bibr" rid="bib40">Kershenbaum et al., 2016</xref>). First, they segment song into units, often called syllables, and second, they assign each syllable a label. Labels correspond to a set of discrete syllable classes that a researcher defines for each individual bird. 
Many models and analyses rely on song annotated at the syllable level, including: statistical models of syntax (<xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref>; <xref ref-type="bibr" rid="bib38">Jin et al., 2011</xref>; <xref ref-type="bibr" rid="bib9">Berwick et al., 2011</xref>; <xref ref-type="bibr" rid="bib34">Hedley, 2016</xref>); computational models of motor learning (<xref ref-type="bibr" rid="bib82">Sober and Brainard, 2009</xref>; <xref ref-type="bibr" rid="bib83">Sober and Brainard, 2012</xref>); and analyses that relate both acoustic features and sequencing of syllables to neural activity (<xref ref-type="bibr" rid="bib48">Leonardo and Fee, 2005</xref>; <xref ref-type="bibr" rid="bib81">Sober et al., 2008</xref>; <xref ref-type="bibr" rid="bib100">Wohlgemuth et al., 2010</xref>). As these examples demonstrate, our ability to leverage songbirds as a model system would be greatly increased if we could automate song annotation.</p><p>Many previously proposed methods for automating annotation follow the same two-step process used when annotating manually. We describe the process in more detail, as illustrated in <xref ref-type="fig" rid="fig1">Figure 1A</xref>, to make it clear how limitations can arise when automating these two steps. First, audio is segmented into syllables by applying a widely-used simple algorithm. Basically, the algorithm consists of setting a threshold on amplitude and finding each uninterrupted series of time points above that threshold. After segmentation, manual annotation proceeds with a researcher assigning labels to syllables (letters [’i’, ’a’, ’b’,…] in <xref ref-type="fig" rid="fig1">Figure 1A</xref>). We emphasize that each individual bird will have a unique song, even though songs are recognizably similar within a species, and that a researcher chooses an arbitrary set of labels for each individual’s repertoire of syllables. 
This means that any automated method must be capable of reliably classifying these arbitrary classes across individuals and species.</p><fig-group><fig id="fig1" position="float"><label>Figure 1.</label><caption><title>Manual annotation of birdsong.</title><p>(<bold>A</bold>) Schematic of the standard two-step process for annotating song by hand (e.g. with a GUI application). Top axes show a spectrogram generated from a brief clip of Bengalese finch song, with different syllable types. Middle and bottom axes show the steps of annotation: first, segments are extracted from song by setting a threshold (’thr.’, dashed horizontal line, bottom axes) on the amplitude and then finding continuous periods above that threshold (colored regions of amplitude trace, bottom axes). This produces segments (colored bars, middle axes) that an expert human annotator manually labels (characters above colored bars), assigning each segment to one of the syllable classes that the annotator defines for each individual bird. (<bold>B</bold>) Examples showing how the standard approach of segmenting with a fixed amplitude threshold does not work well for canary song. Above threshold amplitudes are plotted as thicker colored lines. For a fixed threshold (dotted line, bottom axes), syllables of type ’b’ are correctly segmented, but syllables of type 'a' are incorrectly segmented into two components, and syllables of type 'c' are not segmented.</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig1-v2.jpg"/></fig><fig id="fig1s1" position="float" specific-use="child-fig"><label>Figure 1—figure supplement 1.</label><caption><title>Example of two consecutive canary phrases that differ mostly in inter-syllable gaps.</title><p>In this case, annotation methods that first segment syllables and then use acoustic parameters to classify them will introduce errors. 
By simultaneously learning acoustic and sequence properties, TweetyNet overcomes this weakness.</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig1-figsupp1-v2.jpg"/></fig><fig id="fig1s2" position="float" specific-use="child-fig"><label>Figure 1—figure supplement 2.</label><caption><title>Comparison of descriptive statistics of birdsong syllables across species.</title><p>(<bold>A</bold>) The zebra finch repeating motif allows annotation by matching its template spectrogram without segmenting different syllables (colored bars). (<bold>B</bold>) Bengalese finch songs segmented into syllables show variable transitions and changing numbers of syllable repeats. (<bold>C</bold>) A third of one domestic canary song of median duration segmented into syllables reveals repetition (phrase) structure. (<bold>D</bold>) The median, 0.25 and 0.75 quantiles of song durations (x-axis) and number of syllables per song (y-axis) for two canary strains, Bengalese finches and zebra finches (color coded). (<bold>E</bold>) Variable songs are not suited for template matching. Songs contain repeating sequences of syllables, but because of sequence variability, songs with more syllables (x-axis) share smaller sequence fractions (y-axis). (<bold>F</bold>) Distributions of syllable duration for one domestic canary. The bird had 20 different syllable types (x-axis, ordered by mean syllable duration). Box plot shows median, 0.25 and 0.75 quantiles of syllable durations. Whiskers show the entire range.</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig1-figsupp2-v2.jpg"/></fig></fig-group><p>Previous attempts to automate the annotation of birdsong kept the segmentation and labeling steps separate, and therefore suffered from limitations in each step. 
Methods such as semi-automatic clustering (<xref ref-type="bibr" rid="bib13">Burkett et al., 2015</xref>; <xref ref-type="bibr" rid="bib20">Daou et al., 2012</xref>), and supervised machine learning algorithms (<xref ref-type="bibr" rid="bib91">Troyer lab, 2012</xref>; <xref ref-type="bibr" rid="bib85">Tachibana et al., 2014</xref>; <xref ref-type="bibr" rid="bib57">Nicholson, 2016</xref>), can fail when the song of a species is not reliably segmented using the standard algorithm just described. We illustrate this in <xref ref-type="fig" rid="fig1">Figure 1B</xref> with examples of song from canaries. One reason the standard algorithm does not work is that the amplitude of canary song varies so greatly that no single threshold reliably segments all syllables. Even for species where good segmenting parameters can be found, a given individual’s song will often have one or two syllable classes that require an annotator to clean up its onsets and offsets by hand. Furthermore, other sounds in the environment, such as beak clicks and movement noise, are inevitably segmented as if they were syllables. Machine learning models operating on segmented audio will happily assign these segments a syllable class, resulting in false positives. Various other statistical methods can be used to remove these false positives, such as outlier detection algorithms. In combination with such methods, supervised machine learning models have been used to successfully annotate large-scale behavioral experiments (e.g. <xref ref-type="bibr" rid="bib94">Veit et al., 2021</xref>). But these additional clean-up steps add complexity and require the researcher to perform further tuning and validation.</p><p>Automated annotation methods may also face limitations at the step of labeling segments. Many machine learning models make use of pre-defined, engineered features, that may not reliably discriminate different classes of syllables across individual birds or species. 
Likewise, features extracted from single syllables do not capture temporal dependencies, that if taken into account can improve the classification accuracy (<xref ref-type="bibr" rid="bib6">Anderson et al., 1996</xref>; <xref ref-type="bibr" rid="bib43">Kogan and Margoliash, 1998</xref>; <xref ref-type="bibr" rid="bib57">Nicholson, 2016</xref>). (An example where temporal features are needed is shown in <xref ref-type="fig" rid="fig1s1">Figure 1—figure supplement 1</xref>.) This issue with models that do not leverage temporal information becomes particularly important for species whose song has more variable sequencing (see <xref ref-type="fig" rid="fig1s2">Figure 1—figure supplement 2</xref>), like the Bengalese finch and canary song we study here. Such issues likely account for why there is no prior work on algorithms for automated annotation of canary song at the syllable level. Canaries have provided unique insights into neuronal regeneration, seasonality, interhemispheric coordination, hormones, and behavior (<xref ref-type="bibr" rid="bib27">Goldman and Nottebohm, 1983</xref>; <xref ref-type="bibr" rid="bib65">Nottebohm, 1981</xref>; <xref ref-type="bibr" rid="bib84">Suthers et al., 2012</xref>; <xref ref-type="bibr" rid="bib99">Wilbrecht and Kirn, 2004</xref>; <xref ref-type="bibr" rid="bib4">Alvarez-Buylla et al., 1990</xref>; <xref ref-type="bibr" rid="bib25">Gardner et al., 2005</xref>). 
In spite of this, canary song with its rich syllable repertoire and complex song syntax (<xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref>; <xref ref-type="bibr" rid="bib3">Alonso et al., 2009</xref>; <xref ref-type="bibr" rid="bib7">Appeltants et al., 2005</xref>; <xref ref-type="bibr" rid="bib2">Alliende et al., 2013</xref>) remains understudied, as does the similarly complex song of many other species.</p><p>Given the limitations faced by existing methods, we sought to develop an algorithm for automated annotation of syllables that (1) does not require cleanly segmented audio when predicting labels, (2) only requires training a single model, and (3) does not rely on hand-engineered features. To meet these criteria, we developed a deep neural network that we call TweetyNet, shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>. Deep neural network models have the potential to address our criteria, because they can learn features from the training data itself, and they can be designed to map directly from spectrograms of song to predicted annotations, eliminating the need to segment audio. Below we test whether TweetyNet meets our criteria. To do so, we benchmark TweetyNet on Bengalese finch and canary song. We demonstrate that TweetyNet achieves robust performance across species and individuals, whose song can vary widely even within a species, and across many bouts of song from one individual, i.e., across days of song. 
Using large datasets from actual behavioral experiments, we show that automated annotations produced by TweetyNet replicate key findings about the syntax of song in both species.</p><fig id="fig2" position="float"><label>Figure 2.</label><caption><title>TweetyNet operation and architecture.</title><p>(<bold>A</bold>) TweetyNet takes as input a window from a spectrogram, and produces as output an estimate of the probability that each time bin in the spectrogram window belongs to a class <inline-formula><mml:math id="inf1"><mml:mi>c</mml:mi></mml:math></inline-formula> from the set of predefined syllable classes <inline-formula><mml:math id="inf2"><mml:mi>C</mml:mi></mml:math></inline-formula>. This output is processed to generate the labeled segments that annotations are composed of: (1) We apply the argmax operation to assign each time bin the class with the highest probability. (2) We use the ‘background’ class we add during training (indicated as ‘Bg.’) to find continuous segments of syllable class labels. (3) We post-process these segments, first discarding any segment shorter than a minimum duration (dashed circle on right side) and then taking a majority vote to assign each segment a single label (dashed circles on left side). (<bold>B</bold>) TweetyNet maps inputs to outputs through a series of operations: (1) The convolutional blocks produce a set of feature maps by convolving (asterisk) their input and a set of learned filters (greyscale boxes). A max-pooling operation downsamples the feature maps in the frequency dimension. (2) The recurrent layer, designed to capture temporal dependencies, is made up of Long Short-Term Memory (LSTM) units. We use a bidirectional LSTM that operates on the input sequence in both the forward (F) and backward (B) directions to produce a hidden state for each time step, modulated by learned weights in the LSTM units. 
(3) The hidden states are projected onto the different syllable classes by a final linear transformation, resulting in a vector of class probabilities for each time bin <inline-formula><mml:math id="inf3"><mml:mi>t</mml:mi></mml:math></inline-formula>. For further details, please see section ‘Neural network architecture’ in Materials and methods.</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig2-v2.jpg"/></fig><sec id="s1-1"><title>Proposed model</title><p>First we describe our approach in enough detail to provide context. As shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>, a TweetyNet model takes as input a window from a spectrogram of song, and produces as output a label for each time bin of that spectrogram window. Because it labels each time bin in a spectrogram, TweetyNet does not require segmented audio to predict annotations. In order to recover segments from the network output, we add a ‘background’ class to the set of syllable labels (indicated as ‘Bg.’ in <xref ref-type="fig" rid="fig2">Figure 2A</xref>). This class includes the brief quiet intervals between syllables, as well as noises, such as beak clicks, wing flaps, and non-song calls. When predicting annotations for new data, we use these time bins classified as ‘background’ to find segmented syllables in the network’s output.</p><sec id="s1-1-1"><title>Neural network architecture</title><p>The neural network architecture we developed is most closely related to those designed for event detection, as studied with audio (<xref ref-type="bibr" rid="bib10">Böck and Schedl, 2012</xref>; <xref ref-type="bibr" rid="bib68">Parascandolo et al., 2016</xref>) or video (<xref ref-type="bibr" rid="bib47">Lea et al., 2017</xref>) data, where the task is to map a time series to a sequence of segments belonging to different event classes. 
Like those previous works, TweetyNet’s architecture combines two structural elements commonly found in neural networks, as shown in <xref ref-type="fig" rid="fig2">Figure 2B</xref>. The first element is a convolutional block, used in computer vision tasks to learn features from images (<xref ref-type="bibr" rid="bib28">Goodfellow et al., 2016</xref>; <xref ref-type="bibr" rid="bib22">Farabet et al., 2013</xref>; <xref ref-type="bibr" rid="bib46">Krizhevsky et al., 2012</xref>). (The term ‘block’ refers to a group of operations.) The second element is a recurrent layer, often used to predict sequences (<xref ref-type="bibr" rid="bib31">Graves, 2012</xref>). Specifically, we use a bidirectional Long Short-Term Memory (LSTM) layer that has been shown to successfully learn temporal correlations present in speech (<xref ref-type="bibr" rid="bib31">Graves, 2012</xref>), music, and acoustic scenes (<xref ref-type="bibr" rid="bib10">Böck and Schedl, 2012</xref>; <xref ref-type="bibr" rid="bib68">Parascandolo et al., 2016</xref>). Importantly, we maximized the information available to the recurrent layer by choosing parameters for the pooling operation in the convolutional blocks that <italic>did not</italic> downsample in the temporal dimension. We made this choice based on previous work on automatic speech recognition (<xref ref-type="bibr" rid="bib73">Sainath et al., 2013a</xref>; <xref ref-type="bibr" rid="bib74">Sainath et al., 2013b</xref>). Please see section ‘Neural network architecture’ in Materials and methods for a more detailed description of the network architecture, parameters, and citations of relevant literature that motivated our design choices.</p></sec><sec id="s1-1-2"><title>Post-processing neural network output and converting it to annotations</title><p>In the results below, we show that we significantly reduce error by post-processing network outputs with two simple transformations. 
So that these results are clear, we now describe how we convert outputs to annotation, including post-processing. For each window from a spectrogram, the network outputs a matrix with shape (<inline-formula><mml:math id="inf4"><mml:mi>c</mml:mi></mml:math></inline-formula> classes <inline-formula><mml:math id="inf5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mo>×</mml:mo><mml:mspace width="2pt"/><mml:mi>t</mml:mi></mml:mrow></mml:mstyle></mml:math></inline-formula> time bins) (ignoring the batch dimension). Values along dimension <inline-formula><mml:math id="inf6"><mml:mi>c</mml:mi></mml:math></inline-formula> are the probabilities that the network assigns to each class label. Along that dimension, we apply the <inline-formula><mml:math id="inf7"><mml:mrow><mml:mpadded width="+1.7pt"><mml:mi>arg</mml:mi></mml:mpadded><mml:mo></mml:mo><mml:mi>max</mml:mi></mml:mrow></mml:math></inline-formula> operation (‘argmax’ in <xref ref-type="fig" rid="fig2">Figure 2A</xref>) to produce a vector of length <inline-formula><mml:math id="inf8"><mml:mi>t</mml:mi></mml:math></inline-formula>, where the value in each time bin is the class label that the network estimated had the highest probability of occurring in that time bin. We recover segments from this vector by finding all uninterrupted runs of syllable labels that are bordered by bins labeled with the ‘background’ class. We consider each of these continuous runs of syllable labels to be a segment.</p><p>To clean up these segments, we apply two transformations (‘Post processing’ in <xref ref-type="fig" rid="fig2">Figure 2A</xref>). First, we remove any segment shorter than a minimum duration, specified by a user. Second, we then take a ‘majority vote’ by counting how many times each label is assigned to any time bin in a segment, and then assigning the most frequently occurring label to all time bins in the segment, overriding any others. 
To annotate an entire spectrogram corresponding to one bout of song, we feed consecutive windows from the spectrogram into a trained network, concatenate the output vectors of labeled timebins, and then apply the post-processing. Finally we convert the onset and offset of each predicted segment back to seconds, using the times associated with each bin in the spectrogram, and we convert the segment’s integer class label back to the character label assigned by human annotators.</p><p>Using the method just described, a single TweetyNet model trained end-to-end can successfully annotate entire bouts of song at the syllable level. We are aware of only one previous study that takes a similar approach, from <xref ref-type="bibr" rid="bib44">Koumura, 2016</xref>. That study evaluated pipelines combining a convolutional neural network for classifying spectrogram windows with additional models that learn to correctly predict sequences of labels (e.g. Hidden Markov Models). In contrast, TweetyNet is a single neural network trained end-to-end, meaning it does not require optimizing multiple models. That previous study also focused on annotating specific sequences of interest within a song. Here, our goal is to annotate entire song bouts, not specific sequences, so as to automate the process as much as possible.</p></sec></sec></sec><sec id="s2" sec-type="results"><title>Results</title><p>We assess performance of TweetyNet in two ways. First, we benchmark TweetyNet as a machine learning model, adopting good practices from that literature. We use a metric that we call the syllable error rate, by analogy with the word error rate, the standard metric for automatic speech recognition. It is an edit distance, meaning its magnitude increases with the number of edits (insertions, deletions, and substitutions) required to ‘correct’ the predicted sequence of labels so that it matches the ground truth sequence. (For specifics, see ‘Metrics’ in Materials and methods.) 
The edit distance is normalized, converting it into a rate, as required to measure performance across sequences of different lengths. We show syllable error rate as a percentage throughout for readability. Thus, a 1.0% syllable error rate can be thought of as ‘one edit per every 100 syllable labels’. It should be noted, though, that the syllable error rate can grow larger than 100%, for example if a predicted sequence has many more labels than the original. As results below show, this metric is very informative when benchmarking a model such as ours.</p><p>The second way we study TweetyNet’s performance is meant to align with the point-of-view of an experimentalist, who simply wants to know whether the annotations that TweetyNet produces are ‘good enough’ to answer their research question. To that end, we show that annotations predicted by trained TweetyNet models recover key findings from behavioral studies in Bengalese finches and canaries, by fitting statistical models of song syntax to predicted annotations.</p><sec id="s2-1"><title>TweetyNet avoids limitations that arise from segmenting audio</title><p>To show that TweetyNet avoids issues that result from relying on segmented audio (as described in the Introduction), we compare its performance with a model that predicts labels given engineered acoustic features extracted from segmented audio. Specifically, we use a Support Vector Machine (SVM) model and pre-defined features adapted from <xref ref-type="bibr" rid="bib85">Tachibana et al., 2014</xref> as described in ‘Comparison with a Support Vector Machine model’ in Materials and methods. To compare these two models we generated learning curves that plot performance as a function of the amount of manually-annotated training data.</p><p>The core question is: how does each model perform when applied to unlabeled data that has been pre-processed as required, <italic>without</italic> any additional inspection or arduous manual cleaning from a human expert? 
For the SVM, the unlabeled data is pre-processed by segmenting the audio, while for TweetyNet, the audio files are converted to spectrograms. For both models, data is pre-processed for prediction with the exact same parameters used to pre-process training data: for example, with the same amplitude threshold used to segment audio. To simulate this for the SVM, we needed to re-segment the audio of the test set, because the segments in the ground truth annotations have been carefully cleaned by expert human annotators. We then obtained SVM predictions for these ‘raw’ segments. For all training set sizes, the syllable error rate of the SVM given ‘raw’ segments was higher than the syllable error rate of TweetyNet, as can be seen by comparing the dotted orange lines and the solid blue lines in <xref ref-type="fig" rid="fig3">Figure 3</xref>.</p><fig id="fig3" position="float"><label>Figure 3.</label><caption><title>Comparison of TweetyNet with a support vector machine (SVM) model.</title><p>Plots show syllable error rate (y axis) as a function of training set size (x axis, size of training set in seconds). Syllable error rate is an edit distance computed on sequences of text labels. Here it is measured on a fixed, held-out test set (never seen by the model during training). Hues correspond to model type: TweetyNet neural network (blue) or SVM (orange). Shaded areas around lines indicate the standard deviation across song of individual birds, and across model training replicates (each trained with different subsets of data randomly drawn from a total training set, n = 4 Bengalese finches, 10 replicates per bird; n = 3 canaries, 7 replicates per bird). 
Line style indicates input to model: spectrogram (solid line), or segmented audio, processed in three different ways, either manually cleaned by human annotators (dashed), not cleaned at all (dotted), or cleaned with a semi-automatic approach (dot-dash).</p><p><supplementary-material id="fig3sdata1"><label>Figure 3—source data 1.</label><caption><title>Data used to generate line plots.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig3-data1-v2.csv"/></supplementary-material></p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig3-v2.jpg"/></fig><p>This estimate of syllable error rate for the SVM may seem overly pessimistic. For example, an expert human annotator could remove any non-song noises between song bouts fairly efficiently. To mimic this simple cleaning step, we removed any segments in the re-segmented audio that did not occur between the first onset and the last offset in the manually annotated, ground truth data. Our intent was to remove most of the noises that a human annotator could rapidly identify, while leaving any mis-segmented syllables that the annotator would need to carefully adjust by hand. In this setting, with semi-automated clean-up of the segments, the SVM also had a higher syllable error rate than TweetyNet across all canaries, for all training set sizes (compare dash-dotted orange line in <xref ref-type="fig" rid="fig3">Figure 3</xref> with solid blue lines). For Bengalese finches, syllable error rate of the SVM started out lower than TweetyNet, but with 10 min of training data, error for TweetyNet was lower, and this difference was statistically significant (p &lt; 0.001, Wilcoxon signed-rank test). 
This result indicates that much of the increased syllable error rate can be attributed to imperfect segmenting of the true syllables and other noises that take place during song bouts.</p><p>We did observe that SVM models could actually achieve a very low syllable error rate, when provided with audio segments that have been manually cleaned by human annotators. The syllable error rate of SVM predictions on this perfectly clean data is lower than that of TweetyNet. For models trained with 10 min of data, this difference was again significant (p &lt; 0.05, Wilcoxon signed-rank test). However, if applying a machine learning model required human annotators to manually clean the segments produced from audio by the standard algorithm, it would defeat the purpose of automating annotation.</p><p>Lastly, we observed that there was a much higher standard deviation in error rate, computed across individuals and training replicates, for SVM models predicting labels for uncleaned or semi-cleaned segmented audio when compared with TweetyNet (p &lt; 0.001, Levene’s test). The standard deviation is indicated by the shaded areas in <xref ref-type="fig" rid="fig3">Figure 3</xref>. This result shows that TweetyNet performs well across random samples of each bird’s song, because each replicate was trained on a randomly drawn subset from a larger pool of training data. Our software ensured that at least one instance of each syllable class was present in those subsets (please see ‘Learning curves’ for details). This result suggests that experimenters will not need to carefully construct training sets of data to fit TweetyNet models, as long as they ensure that training sets contain a minimum number of instances of each syllable class.</p></sec><sec id="s2-2"><title>TweetyNet annotates with low error rates across individuals and species</title><p>The third criterion we set out above is that our model should be capable of learning the unique song of each individual. 
Here we show that this criterion is met by our method achieving low error across individuals and across species. To show this, we carried out further experiments, adding song from an additional four Bengalese finches from the dataset accompanying <xref ref-type="bibr" rid="bib44">Koumura, 2016</xref> (see ‘Annotation of Bengalese finch song’ in Materials and methods for details). This gave us a dataset of song from 8 Bengalese finches recorded and annotated in two different research groups. In <xref ref-type="fig" rid="fig4">Figure 4</xref>, we show learning curves for the 8 Bengalese finches and the three canaries, this time plotting lines for each individual, to better understand how the model performs for each bird’s unique song. Here, we consider the syllable error rate as defined above, and in addition the frame error, which is the fraction of time bins classified incorrectly, displayed as a percent. Results here and in the next section will demonstrate why it is important to measure both the frame error and the syllable error rate. Across all 8 Bengalese finches, the mean syllable error rate obtained by TweetyNet was 0.9%, and the mean frame error was 1.56%. It can be seen that the model performed well across most birds and training replicates, although for two birds the syllable error rate exhibited a relatively high standard deviation for training sets of size 75 s or less. Across all individuals, it appeared that 8–10 min worth of manually annotated data was the minimal amount needed to train models achieving the lowest observed syllable error rates. For canaries, with 11 min of training data, the mean syllable error rate was 5.17%, and the mean frame error was 2.91%. It was unclear from the learning curves for canaries whether the syllable error rate of TweetyNet had reached an asymptotic value at the largest training set size. Because training models on canary song could be computationally expensive, we did not include larger data sets for these curves. 
To obtain an estimate of the asymptotic syllable error rate, for each bird we trained one replicate on a single dataset of 60 minutes of song (instead of training multiple replicates with randomly drawn subsets of the training data). This produced an estimated asymptotic mean syllable error rate of 3.1(± 0.2)% for TweetyNet on canary song. Taken together, these benchmarking results suggest that the syllable error rate of TweetyNet is low enough to enable automated annotation of large-scale datasets from behavioral experiments. We show this rigorously below, but first we interrogate more closely how the model achieves this performance.</p><fig id="fig4" position="float"><label>Figure 4.</label><caption><title>Performance of TweetyNet across songs of 8 Bengalese finches and three canaries.</title><p>Plots show frame error (y axis, top row) and syllable error rate (y axis, bottom row) as a function of training set size (x axis, in seconds). Frame error is the fraction of time bins in spectrograms for which the network predicted an incorrect label, while syllable error rate is an edit distance computed on sequences of labels for the segments that we recover from the vectors of labeled time bins (as described in main text). Thick line is mean across all individuals, thinner lines with different styles correspond to individual birds (each having a unique song). Shaded areas around lines for each bird indicate standard deviation of metric plotted across multiple training replicates, each using a different randomly-drawn subset of the training data. Metrics are computed on a fixed test set held constant across training replicates. 
Here hue indicates species (as in <xref ref-type="fig" rid="fig5">Figure 5A</xref> below): Bengalese finches (magenta, left column) and canaries (dark gray, right column).</p><p><supplementary-material id="fig4sdata1"><label>Figure 4—source data 1.</label><caption><title>Data used to generate plots for Bengalese finches.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig4-data1-v2.csv"/></supplementary-material></p><p><supplementary-material id="fig4sdata2"><label>Figure 4—source data 2.</label><caption><title>Data used to generate plots for canaries.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig4-data2-v2.csv"/></supplementary-material></p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig4-v2.jpg"/></fig></sec><sec id="s2-3"><title>Simple post-processing greatly reduces syllable error rates</title><p>One of our criteria for an automated annotation method was that it should only require training a single model. Although our approach meets this criterion, there are of course hyperparameters for training the model that we tuned during our experiments, and there is additional post-processing applied to the model outputs when converting them to annotations. (The term ‘hyperparameter’ refers to parameters that configure the model, such as the batch size during training or the size of the spectrogram windows, as opposed to the parameters in the model itself, optimized by training.) 
Here, we take a closer look at how post-processing and hyperparameters impact performance, to understand how TweetyNet works ‘under the hood’, and to provide a starting point for users applying the model to their own data.</p><p>As described above, the post-processing consists of first discarding any segments shorter than a minimum duration, and then taking a ‘majority vote’ within any consecutive run of labels between time bins labeled as ‘background’. To understand how this impacts performance, we computed frame error and syllable error rate with and without post-processing, as shown in <xref ref-type="fig" rid="fig5">Figure 5</xref>. We found that post-processing had little effect on the frame error (compare dashed and solid lines in <xref ref-type="fig" rid="fig5">Figure 5A</xref> top row), but that it greatly reduced the syllable error rates (bottom row). To understand this difference, we performed further analysis. We found that many of the frame errors could be attributed to disagreements between the model predictions and the ground truth annotations about the onset and offset times of syllables (see <xref ref-type="fig" rid="fig5s1">Figure 5—figure supplement 1</xref>). These syllable boundaries are naturally variable in the ground truth data, but such mismatches between the model predictions and the ground truth do not change the label assigned to a segment, and thus do not contribute to the syllable error rate. We also asked whether the increased syllable error rate might be due to errors that result when the model sees sequences of syllables that occur with very low probability. We were unable to find strong evidence that these infrequently-seen sequences caused the model to make errors. Rarely occurring sequences had little effect even when we limited the performance of our model by shrinking the size of the hidden state in the recurrent layer (see <xref ref-type="fig" rid="fig5s2">Figure 5—figure supplement 2</xref>). 
The results of this further analysis and the difference we observed between frame error and syllable error rate suggested to us that our post-processing corrects a small number of mislabeled frames peppered throughout the network outputs, which has a comparatively large effect on the syllable error rate.</p><fig-group><fig id="fig5" position="float"><label>Figure 5.</label><caption><title>The effect of post-processing and hyperparameters on TweetyNet performance.</title><p>(<bold>A</bold>) Mean frame error (top row) and mean syllable error rate, across all birds and training replicates, as a function of training set duration. Hue indicates species (Bengalese finches, magenta; canaries, dark gray). Line style indicates whether the metric was computed with (solid lines) or without (dashed lines) post-processing of the vectors of labeled time bins that TweetyNet produces as output. (Note solid lines are same data as <xref ref-type="fig" rid="fig4">Figure 4</xref>). (<bold>B, C</bold>). Performance for a range of values for two key hyperparameters: the size of windows from spectrograms shown to the network (<bold>B</bold>) and the size of the hidden state in the recurrent layer (<bold>C</bold>). Box-and-whisker plots show metrics computed at the maximum training set duration we used for the curves in A (‘Max. train dur.’, black arrow in A). We chose the maximum training set durations because at those durations the metrics were closest to the asymptotic minimum approached by the learning curves. Top row of axes in both B and C shows frame error, and bottom row of axes shows syllable error rate. Blue boxes are metrics computed with post-processing transforms applied, orange boxes are error rates without those transforms. 
Tick labels in boldface on axes in B and C represent the hyperparameters we used for results shown in A, and <xref ref-type="fig" rid="fig3">Figures 3</xref> and <xref ref-type="fig" rid="fig4">4</xref>.</p><p><supplementary-material id="fig5sdata1"><label>Figure 5—source data 1.</label><caption><title>Data used to generate line plots in <xref ref-type="fig" rid="fig5">Figure 5A, B</xref>, <xref ref-type="fig" rid="fig5s3">Figure 5—figure supplement 3</xref>.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig5-data1-v2.csv"/></supplementary-material></p><p><supplementary-material id="fig5sdata2"><label>Figure 5—source data 2.</label><caption><title>Data used to generate box plots in <xref ref-type="fig" rid="fig5">Figure 5B, C</xref>, <xref ref-type="fig" rid="fig5s3">Figure 5—figure supplement 3</xref>.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig5-data2-v2.csv"/></supplementary-material></p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig5-v2.jpg"/></fig><fig id="fig5s1" position="float" specific-use="child-fig"><label>Figure 5—figure supplement 1.</label><caption><title>Most frame errors of trained TweetyNet models are disagreement on syllable boundaries of 0–2 time bins.</title><p>Potential syllable boundary disagreements are time bins in which the ground truth test set and the trained TweetyNet model disagree and just one of them assigns the ’background’ label. 
The histograms show the distances of those time bins from the nearest syllable boundary in test sets 5000 seconds long.</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig5-figsupp1-v2.jpg"/></fig><fig id="fig5s2" position="float" specific-use="child-fig"><label>Figure 5—figure supplement 2.</label><caption><title>Frame errors in rarely-occurring Bengalese finch sequences.</title><p>Each dot represents a syllable sequence a-b-<bold>y</bold>. The x-axis shows the ratio between the frequency of a-b-<bold>y</bold> and the frequency of the most common sequence a-b-<bold>x</bold> (p(rare) and p(frequent) respectively). The y-axis shows the frame error measured in the segments <bold>y</bold> occurring in the context a-b-<bold>y</bold>. (<bold>A</bold>) TweetyNet models with very small hidden state sizes will have large error rates in some, but not all, of the rarely occurring events. This is seen in the negative Pearson correlation (top panel, r values) between the error rate (y-axis) and the relative rarity of the sequence (x-axis) and in larger fractions of above-median error rates in the more rare events compared to the more common among the data in the top panel (Bars, bottom panel. Error bars showing S.E.). These effects are almost never significant (Pearson r,p in the top panels and the binomial z-test and p values in the bottom panels) and decrease if applying the majority vote transform (orange dots) or when training the networks with more data (left to right panels showing training sets 30–600 s long). Note these results are from networks trained with a hidden state size of 16. For main results in <xref ref-type="fig" rid="fig5">Figures 5A</xref> and <xref ref-type="fig" rid="fig6">6</xref> we used a size of 256. 
(<bold>B</bold>) Repeats A but with hidden state size of 64, and showing an even smaller effect.</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig5-figsupp2-v2.jpg"/></fig><fig id="fig5s3" position="float" specific-use="child-fig"><label>Figure 5—figure supplement 3.</label><caption><title>Filter size experiments.</title></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig5-figsupp3-v2.jpg"/></fig><fig id="fig5s4" position="float" specific-use="child-fig"><label>Figure 5—figure supplement 4.</label><caption><title>Filter number experiments.</title></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig5-figsupp4-v2.jpg"/></fig></fig-group><p>Next we sought to understand how the hyperparameters used during training affected the small number of incorrect frames that inflate the syllable error rate. We focused on two key hyperparameters we considered most likely to affect syllable error rate: the size of windows from spectrograms shown to the network (measured in the number of time bins), and the size of the hidden state in the recurrent layer. The window size determines the context the network sees, while the hidden state size determines the network’s capacity to integrate contextual information across time steps. We ran further experiments using a range of values for both hyperparameters to determine how they impact performance. In all cases, we saw that both hyperparameters had little effect on frame error (top row in <xref ref-type="fig" rid="fig5">Figure 5B and C</xref>) but a large effect on syllable error rate (bottom row in <xref ref-type="fig" rid="fig5">Figure 5B and C</xref>). This difference between metrics is again consistent with the idea that the main contributor to the syllable error rate is a handful of frame errors scattered across the network outputs. 
These experiments also confirmed that the values we chose to obtain results in <xref ref-type="fig" rid="fig3">Figures 3</xref> and <xref ref-type="fig" rid="fig4">4</xref> were close to optimal; smaller values would have negatively impacted performance, and larger values would have yielded little or no additional gain. (There is no widely-accepted method to find truly optimal hyperparameters.) In all cases, the effect of these hyperparameters was clear when looking at the model outputs before post-processing (orange boxes in <xref ref-type="fig" rid="fig5">Figure 5B and C</xref>). We did not see any similar effect when testing other hyperparameters such as filter size (<xref ref-type="fig" rid="fig5s3">Figure 5—figure supplement 3</xref>) and number (<xref ref-type="fig" rid="fig5s4">Figure 5—figure supplement 4</xref>). In total, these results show that our algorithm is in fact learning something about the sequences, by leveraging context from the windows that it sees and by storing information it propagates across time steps in its hidden state.</p><p>However, our simple post-processing step had a much larger effect on both error metrics, making the impact of hyperparameters difficult to see when plotted at the same scale (blue boxes in <xref ref-type="fig" rid="fig5">Figure 5B and C</xref>). Therefore, the results also demonstrate that even with well-chosen hyperparameters the network outputs contain segmenting errors that our post-processing removes. In all cases, we were able to reduce the syllable error rate by nearly an order of magnitude with post-processing. We return to this point in the discussion.</p></sec><sec id="s2-4"><title>Birdsong annotated automatically with TweetyNet replicates key behavioral findings</title><p>We next assessed performance of TweetyNet in a scenario more similar to how an experimentalist would apply our approach to their data. 
Specifically, we asked whether we could replicate key findings from previous behavioral experiments, using annotations predicted by TweetyNet.</p><sec id="s2-4-1"><title>TweetyNet annotation of Bengalese finch song replicates statistics of branch points</title><p>Bengalese finch song is known to contain <italic>branch points</italic>, where one class of syllable can transition to two or more other classes. An example is shown in <xref ref-type="fig" rid="fig6">Figure 6A and B</xref>. <xref ref-type="bibr" rid="bib96">Warren et al., 2012</xref> showed that these transition probabilities are stable across many days. We asked if we could replicate this result with automated annotation for several full days of recordings, predicted by TweetyNet models trained on a relatively small set of manually-annotated songs. To do so, we used the dataset from <xref ref-type="bibr" rid="bib58">Nicholson et al., 2017</xref>, that contains recordings from 4 Bengalese finches, whose every song was manually annotated across 3–4 days. We verified that in the ground truth annotations from <xref ref-type="bibr" rid="bib58">Nicholson et al., 2017</xref> we could replicate the key finding from <xref ref-type="bibr" rid="bib96">Warren et al., 2012</xref>, that branch point statistics were stable across several days (Bonferroni-corrected pairwise bootstrap test, n.s. in all cases).</p><fig id="fig6" position="float"><label>Figure 6.</label><caption><title>Replicating results on branch points in Bengalese finch song with annotations predicted by TweetyNet.</title><p>(<bold>A</bold>) Representative example of a Bengalese finch song with a <italic>branch point</italic>: the syllable labeled ’e’ can transition to either ’f’, as highlighted with blue rectangles, or to ’i’, as highlighted with an orange rectangle. (<bold>B</bold>) Transition probabilities for this branch point, computed from one day of song. 
(<bold>C</bold>) Syllable error rates per day for each bird from <xref ref-type="bibr" rid="bib58">Nicholson et al., 2017</xref>. Solid line is mean and shaded area is standard deviation across 10 training replicates. Line color and style indicate individual animals. TweetyNet models were trained on 10 min of manually annotated song, a random subset drawn from data for day 1. Then syllable error rates were computed for the remaining songs from day 1, and for all songs from all other days. (<bold>D</bold>) Transition probabilities across days for the branch point in A and B, computed from the ground truth annotations (solid lines) and the annotations predicted by TweetyNet (dashed lines). Shaded area around dashed lines is standard deviation of the estimated probabilities, across the 10 training replicates. (<bold>E</bold>) Group analysis of transition. x axis is probability computed from the ground truth annotations, and the y axis is probability estimated from the predicted annotations. Dashed line is ‘x = y’, for reference. 
Each (color, marker shape) combination represents one branch point from one bird.</p><p><supplementary-material id="fig6sdata1"><label>Figure 6—source data 1.</label><caption><title>Data used to generate line plot in <xref ref-type="fig" rid="fig6">Figure 6C</xref>.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig6-data1-v2.csv"/></supplementary-material></p><p><supplementary-material id="fig6sdata2"><label>Figure 6—source data 2.</label><caption><title>Data used to generate line plot in <xref ref-type="fig" rid="fig6">Figure 6D</xref>.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig6-data2-v2.csv"/></supplementary-material></p><p><supplementary-material id="fig6sdata3"><label>Figure 6—source data 3.</label><caption><title>Data used to generate scatter plot in <xref ref-type="fig" rid="fig6">Figure 6E</xref>.</title></caption><media mime-subtype="zip" mimetype="application" xlink:href="elife-63853-fig6-data3-v2.zip"/></supplementary-material></p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig6-v2.jpg"/></fig><p>Before testing whether we could recover this finding from annotations predicted by TweetyNet, we first measured model performance across entire days of song. Using models trained on 10 min (for experiments in <xref ref-type="fig" rid="fig3">Figures 3</xref> and <xref ref-type="fig" rid="fig4">4</xref>), we predicted annotations for the remainder of the songs. As shown in <xref ref-type="fig" rid="fig6">Figure 6C</xref>, we found that these TweetyNet models maintained low syllable error rates when measured with entire days of song, without exhibiting large fluctuations across days. The syllable error rate ranged from 1% to 5% across 3–4 days of song from each of the four birds, comparable to rates observed in <xref ref-type="fig" rid="fig4">Figure 4</xref>. 
We emphasize that the days of songs we used as test sets here are much larger than those we used to benchmark models in <xref ref-type="fig" rid="fig4">Figure 4</xref>. The mean duration of these test sets was 1528 seconds (s.d. 888.6 s, i.e. 25 min mean, 14 min s.d.), in contrast to <xref ref-type="fig" rid="fig4">Figure 4</xref> where we measured syllable error rates with a fixed test set of 400 s (6 min 40 s).</p><p>Next we asked whether we could recover the behavioral findings using annotations predicted by TweetyNet. Applying the same analysis from <xref ref-type="bibr" rid="bib96">Warren et al., 2012</xref>, we found that annotations predicted by TweetyNet were statistically indistinguishable from the ground truth data (Bonferroni-corrected pairwise bootstrap test, again n.s. in all cases). This can be seen by overlaying model and ground truth predictions, as in the representative example in <xref ref-type="fig" rid="fig6">Figure 6D</xref>. Summary results for all branch points on all days in all four birds are shown in <xref ref-type="fig" rid="fig6">Figure 6E</xref>, again illustrating that the probabilities estimated from predicted annotations were quite similar to those estimated from the ground truth.</p></sec><sec id="s2-4-2"><title>TweetyNet annotation of canary song replicates statistical models of song structure</title><p>Canary songs consist of trills of repeated syllables called phrases (<xref ref-type="fig" rid="fig1">Figure 1B</xref>). <xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref> examined sequences of phrases of Waterslager canaries and found transitions with different memory depths. They showed this by describing the probability distribution of transition outcomes from certain phrases with Markov chains of variable length. This syntax structure is captured parsimoniously by probabilistic suffix trees (PST) (<xref ref-type="bibr" rid="bib72">Ron et al., 1996</xref>). 
The root node in these graphical models, appearing in the middle of <xref ref-type="fig" rid="fig7">Figure 7A and B</xref>, represents the zero-order Markov, or base rate, frequencies of the different phrases, labeled in different colors and letters. Each branch, emanating from the colored letters in <xref ref-type="fig" rid="fig7">Figure 7</xref>, represents the set of Markov chains that end in the specific phrase type designated by that label. For example, the ’A’ branch in <xref ref-type="fig" rid="fig7">Figure 7A</xref> includes the first order Markov model ’A’ and the second order Markov chains ’FA’ and ’1A’ representing the second order dependence of the transition from phrase ’A’.</p><fig-group><fig id="fig7" position="float"><label>Figure 7.</label><caption><title>Replicating and extending results about canary syntax dependencies with annotations predicted by TweetyNet.</title><p>(<bold>A</bold>) Long-range order found in 600 domestic canary songs annotated with human proof reader (methods, similar dataset size to <xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref>). Letters and colors indicate phrase types. Each branch terminating in a given phrase type indicates the extent to which song history impacts transition probabilities following that phrase. Each node corresponds to a phrase sequence, annotated in its title, and shows a pie chart representing the outgoing transition probabilities from that sequence (e.g. the pie ’1A’ shows the probabilities of phrases ’B’, ’C’, and ’F’ which follow the phrase sequence ’1→ A’). The nodes are scaled according to their frequency (legend). Nodes that can be grouped together (chunked as a sequence) without significantly reducing the power of the model are labeled with blue text. These models are built by iterative addition of nodes up the branch to represent longer Markov chains, or a transition's dependence on longer sequences of song history. 
A TweetyNet model was trained using 2.2% of 1,764 songs (9.5% of the data in A). The PST created from the model’s predicted annotation of the entire dataset is very similar to A (see full comparison in <xref ref-type="fig" rid="fig7s1">Figure 7—figure supplement 1</xref>). Here, branch differences between the hand labeled and model labeled song are marked by red and blue dashed lines for added and missed branches. (<bold>B</bold>) PST created using all 1,764 hand labeled songs. An almost identical PST was created <italic>without</italic> a human proof reader from a TweetyNet model trained on 19% of the data (see full comparison in <xref ref-type="fig" rid="fig7s2">Figure 7—figure supplement 2</xref>).</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig7-v2.jpg"/></fig><fig id="fig7s1" position="float" specific-use="child-fig"><label>Figure 7—figure supplement 1.</label><caption><title>Detailed comparison of syntax structure in 600 hand labeled or TweetyNet-labeled canary songs.</title><p>We plot the full probabilistic suffix trees created from 600 hand labeled canary songs (<bold>A</bold>) and from the prediction of a TweetyNet model trained on 2.2% of this bird’s song (<bold>B</bold>).</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig7-figsupp1-v2.jpg"/></fig><fig id="fig7s2" position="float" specific-use="child-fig"><label>Figure 7—figure supplement 2.</label><caption><title>Detailed comparison of syntax structure in 1764 hand labeled or TweetyNet-labeled canary songs.</title><p>Supporting <xref ref-type="fig" rid="fig7">Figure 7B</xref>. We plot the full probabilistic suffix trees created from 1,764 hand labeled canary songs (<bold>A</bold>) and from the prediction of a TweetyNet model trained on 19% of this bird’s song (<bold>B</bold>). 
The fluctuation in transition probabilities accumulates in long sequences and, in this example, increases the minimal sequence probability included in the PST. This increase prevented the inclusion of the ’N’ branch in the model built on TweetyNet’s prediction.</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig7-figsupp2-v2.jpg"/></fig></fig-group><p>We asked if we could replicate findings about canary song syntax in a different strain of canaries using a TweetyNet model trained on a small manually annotated dataset. <xref ref-type="fig" rid="fig7">Figure 7</xref> demonstrates that annotations predicted by TweetyNet had sufficient accuracy on domestic canary song to extract its long-range order. In these figures, we set parameters of the PST estimation algorithm to derive the deepest syntax structure possible without overfitting, following the approach of <xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref> that used about 600 hand-annotated songs of Waterslager canaries. In this example, using 2.2% of the data set, about 40 songs, to train a TweetyNet model and predict the rest of the data reveals the deep structures shown in <xref ref-type="fig" rid="fig7">Figure 7A</xref>, comparable to using 600 hand annotated songs of the same bird. With more training data, TweetyNet’s accuracy improves as does the statistical strength of the syntax model. In <xref ref-type="fig" rid="fig7">Figure 7B</xref> a TweetyNet model was trained on 19% of the data, about 340 songs, and predicted the rest of the data. The resulting syntax model can be elaborated to greater depth without overfitting. 
To validate this deeper model, we compared it with a PST fit to all 1764 songs from the same bird, manually annotated, and found that both PSTs were very similar (<xref ref-type="fig" rid="fig7">Figure 7B</xref>).</p><p>In sum, we find that TweetyNet, trained on a small sample of canary song, is accurate enough to automatically derive the deep structure that has formed the basis of recent studies (<xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref>; <xref ref-type="bibr" rid="bib17">Cohen et al., 2020</xref>).</p></sec><sec id="s2-4-3"><title>Larger data sets of annotated canary song add details and limit the memory of the syntax structure</title><p>The increase in syntax detail, presented in <xref ref-type="fig" rid="fig7">Figure 7B</xref>, is possible because more rare nodes can be added to the PST without over-fitting the data. Formally, the PST precision increase in larger data sets is defined by the decrease in minimal node frequency allowed in the process of building PST models (<xref ref-type="fig" rid="fig8">Figure 8</xref>), as measured in model cross validation (see Materials and methods). In our data set, we find an almost linear relation between the number of songs and this measure of precision—close to a tenfold precision improvement.</p><fig id="fig8" position="float"><label>Figure 8.</label><caption><title>Using datasets more than five times larger than previously explored increases statistical power and the precision of syntax models.</title><p>(<bold>A</bold>) Ten-fold cross validation is used in selection of the minimal node probability for the PSTs (x-axis). Lines show the mean negative log-likelihood of test set data estimated by PSTs in 10 repetitions (methods). Curves are calculated for datasets that are sub sampled from about 5000 songs. Red dots show minimal values - the optimum for building the PSTs. 
(<bold>B</bold>) The decrease in optimal minimal node probability (y-axis, red dots in panel A) for increasing dataset sizes (x-axis) is plotted in gray lines for six birds. The average across animals is shown in black dots and line.</p><p><supplementary-material id="fig8sdata1"><label>Figure 8—source data 1.</label><caption><title>Data used to generate lines in <xref ref-type="fig" rid="fig8">Figure 8A</xref>.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig8-data1-v2.csv"/></supplementary-material></p><p><supplementary-material id="fig8sdata2"><label>Figure 8—source data 2.</label><caption><title>Data used to generate dots in <xref ref-type="fig" rid="fig8">Figure 8A</xref>.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig8-data2-v2.csv"/></supplementary-material></p><p><supplementary-material id="fig8sdata3"><label>Figure 8—source data 3.</label><caption><title>Data used to generate lines in <xref ref-type="fig" rid="fig8">Figure 8B</xref>.</title></caption><media mime-subtype="octet-stream" mimetype="application" xlink:href="elife-63853-fig8-data3-v2.csv"/></supplementary-material></p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig8-v2.jpg"/></fig><p>In <xref ref-type="fig" rid="fig7">Figure 7B</xref>, this increased precision allowed reliably adding longer branches to the PST to represent longer Markov chains (in comparison to <xref ref-type="fig" rid="fig7">Figure 7A</xref>). In this example, using a dataset three times larger revealed a 5-deep branch that initiates with the beginning of song (’1ABGN’), suggestive of a potential global time-in-song dependency of that transition. The PST in <xref ref-type="fig" rid="fig7">Figure 7B</xref> also has branches that did not ’grow’ compared to <xref ref-type="fig" rid="fig7">Figure 7A</xref> when more songs were analyzed (e.g. 
the ’B’, ’Q’, and ’R’ branches), indicating a potential cutoff of memory depth that is crucial in studying the neural mechanisms of song sequence generation.</p><p>The data sets used in <xref ref-type="fig" rid="fig7">Figures 7</xref> and <xref ref-type="fig" rid="fig8">8</xref> are about 10 times larger than previous studies. To ascertain the accuracy of the syntax models, in creating the data sets we manually proofread annotations predicted by TweetyNet (see ‘Annotation of canary song’ in Materials and methods). Across five different human proof readers, we compared the time required to manually annotate canary song with the proof-reading time, and found that using TweetyNet saved 95–97.5% of the labor.</p><p>Taken as a whole, results in this section show that TweetyNet makes high-throughput automated annotation of behavioral experiments possible, greatly reducing labor while scaling up the amount of data that can be analyzed by orders of magnitude.</p></sec></sec><sec id="s2-5"><title>Very rare, hard-to-classify vocal behaviors can cause TweetyNet to introduce errors</title><p>Songbird species vary in the degree to which the elements of their song can be categorized into a set of discrete classes (<xref ref-type="bibr" rid="bib90">Thompson et al., 2012</xref>; <xref ref-type="bibr" rid="bib76">Sainburg et al., 2020</xref>). Even for species where expert annotators can readily define such a set, there will occasionally be periods in song where it is unclear how to classify syllables. Here, we provide examples of these rare cases to illustrate how even a well-trained TweetyNet model can introduce errors in annotation when the behavior itself cannot be cleanly categorized. The examples we present in <xref ref-type="fig" rid="fig9">Figure 9</xref> are from canaries, simply because their song can be so highly varied. 
As these examples illustrate, predictions of TweetyNet models are well-behaved when faced with rare variants, assigning high probability to the most relevant labels, not to completely unrelated classes of syllables. We emphasize that any standard supervised machine learning model that assigns only a single label to each segment will be vulnerable to introducing errors like these. Such errors raise questions about whether and when birdsong can be categorized into discrete syllable classes, questions that are brought back into focus by methods like ours that automate the process. As we will now discuss, we see several ways in which future work can address these questions.</p><fig id="fig9" position="float"><label>Figure 9.</label><caption><title>Rare variants of canary song introduce segmentation and annotation errors.</title><p>(<bold>A-E</bold>) Spectrograms on top of the time-aligned likelihood (gray scale) assigned by a well-trained TweetyNet model to each of the labels (y-axis, 30 syllable types and the tag ’<italic>Bg</italic>.’ for the background segments). Green and red vertical lines and numbers on top of the spectrograms mark the onset, offset, and labels predicted by the model. (<bold>A,B</bold>) Canary phrase transitions can contain a vocalization resembling the two flanking syllables fused together. A TweetyNet model trained to split this vocalization performed very well (<bold>A</bold>) but failed in a rare variant (<bold>B</bold>). The network output highlights a general property: TweetyNet assigned high likelihood to the same flanking syllable types and not to irrelevant syllables. (<bold>C</bold>) Syllables produced soft, weak, and acoustically imprecise at the onset of some canary phrases are mostly captured very well by TweetyNet but, on rare occasions, can be missed. In this example the model assigned high likelihood to the correct label but higher to the background. 
(<bold>D</bold>) Some human annotators, called 'splitters', define more syllable classes. Others, the 'lumpers', group acoustically-diverse vocalizations under the same label. TweetyNet models trained on acoustically-close classes assign high likelihood to both labels and, on rare occasions, flip between them. This example demonstrates that TweetyNet does not use the a-priori knowledge of syllable repeats hierarchically-forming canary phrases. (<bold>E</bold>) Canaries can simultaneously produce two notes from their two bronchi. This occurs in phrase transitions and the spectrogram of the resulting vocalization resembles an overlay of flanking syllables. While the network output shows high likelihood for both syllables the algorithm is forced to choose just one.</p></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig9-v2.jpg"/></fig></sec></sec><sec id="s3" sec-type="discussion"><title>Discussion</title><p>Annotating birdsong at the level of syllables makes it possible to answer questions about the syntax governing this learned sequential behavior (<xref ref-type="bibr" rid="bib9">Berwick et al., 2011</xref>). Annotating syllables also makes it possible to describe them in physical units like pitch and amplitude that researchers can directly link to muscular and neural activity (<xref ref-type="bibr" rid="bib81">Sober et al., 2008</xref>; <xref ref-type="bibr" rid="bib100">Wohlgemuth et al., 2010</xref>). However, for many species of songbirds, analyses at the syllable level still require labor-intensive, time-consuming manual annotation. There is a clear need for a method that can automate annotation across individuals and species, without requiring cleanly segmented audio, and without requiring researchers to carefully tune and validate multiple statistical models. 
To meet this need, we developed a neural network, TweetyNet (<xref ref-type="fig" rid="fig2">Figure 2</xref>): a single model trained end-to-end that learns directly from spectrograms how to automatically annotate each bird’s unique song. TweetyNet is deliberately designed to avoid dividing annotation into separate steps of segmentation and labeling, and it leverages the strengths of deep learning models to learn features for classification from the training data, instead of relying on pre-defined engineered features. We showed that our approach mitigates issues that result from the assumption that audio can be cleanly segmented into syllables (<xref ref-type="fig" rid="fig3">Figure 3</xref>). TweetyNet performs comparably to a carefully tuned Support Vector Machine model operating on pre-defined features extracted from manually cleaned, perfectly segmented audio (<xref ref-type="fig" rid="fig3">Figure 3</xref>). This result might suggest that an alternative to our approach would be to improve the audio segmentation step (e.g. with an alternative algorithm <xref ref-type="bibr" rid="bib88">Tchernichovski et al., 2000</xref>) and to use a state-of-the-art non-neural network model (such as XGBoost <xref ref-type="bibr" rid="bib15">Chen and Guestrin, 2016</xref>). Because such approaches lack the flexibility and expressiveness of deep learning models, we believe they will still require additional tuning our method avoids. For example, one could add classes for background noise to such models, but this would likely require additional steps to deal with class imbalance. 
Our model and problem formulation <italic>requires</italic> adding an additional ‘background’ class, which results in a more general solution (in much the same way that object detection models require and benefit from the addition of a background class <xref ref-type="bibr" rid="bib78">Scheirer et al., 2013</xref>).</p><p>A natural question raised by our results is how TweetyNet integrates the local acoustic structure within a few time bins of a spectrogram and the global structure of syllable sequences within spectrogram windows. To answer this, and gain a deeper understanding of our approach, we carried out experiments varying two key hyperparameters. The first was the size of spectrogram windows shown to the network, which determines how much context the network sees, and the second was the size of the hidden state in the recurrent layer, which determines the network’s capacity to integrate information across time steps. The experiments demonstrated that TweetyNet performance depends on well-chosen values for both hyperparameters (<xref ref-type="fig" rid="fig5">Figure 5</xref>). These empirical results support the idea that TweetyNet learns to recognize local features seen in syllables <italic>and</italic> learns to leverage sequential information across a spectrogram window.</p><p>In addition, we showed our approach performs well across multiple individuals from two different species of songbird, Bengalese finches and canaries (<xref ref-type="fig" rid="fig4">Figure 4</xref>). We demonstrated that using automated annotations produced by TweetyNet, we could replicate key findings from long-term behavioral experiments about the syntax of Bengalese finch (<xref ref-type="fig" rid="fig6">Figure 6</xref>) and canary song (<xref ref-type="fig" rid="fig7">Figures 7</xref> and <xref ref-type="fig" rid="fig8">8</xref>). 
Overall, these results show that our deep learning-based approach offers a robust, general method for automated annotation of birdsong at the syllable level.</p><sec id="s3-1"><title>Ideas and speculation</title><p>Our results open up avenues for future research in two directions: development of neural network algorithms, and applications of those algorithms, once developed. Regarding algorithm development, we suggest that future work should test whether networks can be trained to better learn to segment <italic>without</italic> post-processing. The experiments in <xref ref-type="fig" rid="fig5">Figure 5</xref> suggested that the post-processing we applied improves performance by correcting a small number of mislabeled time bins that cause a large increase in syllable error rate. From this, and from previous related work (<xref ref-type="bibr" rid="bib47">Lea et al., 2017</xref>), our sense is that a logical next step will be to incorporate the syllable error rate into the loss function, minimizing it directly. This would require some modifications to our approach, but may prove more effective than testing different network architectures.</p><p>Another important question for future work is: when is it appropriate to apply supervised learning algorithms to vocalizations, like ours and related object detection-based models (<xref ref-type="bibr" rid="bib16">Coffey et al., 2019</xref>; <xref ref-type="bibr" rid="bib24">Fonseca et al., 2021</xref>), and when should these algorithms be combined or even replaced with unsupervised algorithms? Recently developed unsupervised models learn helpful, often simpler, representations of birdsong and other animal vocalizations (<xref ref-type="bibr" rid="bib26">Goffinet et al., 2021</xref>; <xref ref-type="bibr" rid="bib75">Sainburg et al., 2019</xref>, <xref ref-type="bibr" rid="bib76">Sainburg et al., 2020</xref>). 
These advances and the advantages of methods like TweetyNet are not mutually exclusive, and can be integrated in different ways depending on the needs of researchers. For example, a TweetyNet model can serve as a front-end that finds and classifies segments, which are then passed to an unsupervised model. In addition to annotating syllables, we suggest future work consider two other levels of classification. The first would use TweetyNet to segment audio into just two classes: ‘vocalization’ and ‘non-vocalization’ periods. Treating segmentation as a binary classification problem in this way would make it possible to extend our approach to vocalizations that are not easily categorized into discrete labels: juvenile birdsong, bat calls (<xref ref-type="bibr" rid="bib71">Prat et al., 2017</xref>), and rodent USVs (<xref ref-type="bibr" rid="bib86">Tachibana et al., 2020</xref>), for example. Another level of classification consists of automatically annotating higher-level structures in song such as motifs, phrases (<xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref>) or chunks (<xref ref-type="bibr" rid="bib87">Takahasi et al., 2010</xref>; <xref ref-type="bibr" rid="bib39">Kakishita et al., 2008</xref>). TweetyNet could annotate these explicitly defined higher level structures, that would then be passed to downstream unsupervised models designed for tasks like similarity measurement (e.g. <xref ref-type="bibr" rid="bib26">Goffinet et al., 2021</xref>; <xref ref-type="bibr" rid="bib75">Sainburg et al., 2019</xref>, <xref ref-type="bibr" rid="bib76">Sainburg et al., 2020</xref>). A second way that supervised and unsupervised algorithms could be combined would be to reverse the order, and use the unsupervised model as a front end. For example, models like those of <xref ref-type="bibr" rid="bib76">Sainburg et al., 2020</xref> could be used to automatically generate a candidate set of syllable classes from a relatively small dataset of cleanly segmented song. 
A researcher would visually inspect and validate these candidate classes, and once validated, use them with TweetyNet to bootstrap annotation of a much larger dataset.</p><p>Lastly, we speculate on use of trained TweetyNet models to measure uncertainty and similarity. These measures can be estimated using either the probabilities that TweetyNet produces as outputs, or with so-called ‘activations’ within layers of the network that are elicited by feeding inputs through it. The output probabilities can serve as a metric in syllable space. For example, when predicting new annotations, researchers could use output probabilities from TweetyNet to flag less confident predictions for subsequent human inspection and post-processing. As shown in <xref ref-type="fig" rid="fig9">Figure 9D</xref>, this approach can highlight rare song variants and may also help annotators identify edge cases where they have defined syllable classes that are too similar to each other. More generally, a researcher could use a TweetyNet model trained on a single adult’s song to obtain an estimate of any other song’s similarity to it, such as the adult’s offspring or even recordings of the same adult’s song as a juvenile. This could be done using the output probabilities, or activations within the network. Activations in trained TweetyNet models could also be used to assess the output of unsupervised models that generate vocalizations (<xref ref-type="bibr" rid="bib75">Sainburg et al., 2019</xref>), analogous to similar approaches in computer vision (<xref ref-type="bibr" rid="bib77">Salimans et al., 2016</xref>; <xref ref-type="bibr" rid="bib35">Heusel et al., 2017</xref>).</p></sec><sec id="s3-2"><title>Conclusion</title><p>The family of songbirds that learns by imitation consists of over 4500 species. 
Some of these singers, such as the canary, produce songs that are much too complex to be automatically annotated with existing methods, and for these complex singers little is known about the syntax structure and organization of song. The results we present suggest that our approach makes automated syllable-level annotation for many of these species possible. By sharing trained models, tutorials, data, and a library for benchmarking models, we also establish a strong baseline for work building upon ours. We are confident our method enables songbird researchers to automate annotation of very large datasets of entire bouts of song, required for analyses that address central questions of sensorimotor learning.</p></sec></sec><sec id="s4" sec-type="materials|methods"><title>Materials and methods</title><sec id="s4-1"><title>Data preparation</title><sec id="s4-1-1"><title>Segmenting audio files into syllables</title><sec id="s4-1-1-1"><title>Algorithm</title><p>For Bengalese finch song, we applied a widely-used simple algorithm to segment audio into syllables, as described in the Introduction and shown in <xref ref-type="fig" rid="fig1">Figure 1A</xref>. The first step of this algorithm consists of finding all periods (colored line segments, middle and bottom axes of <xref ref-type="fig" rid="fig1">Figure 1A</xref>) where the amplitude of song stays above some threshold (dashed horizontal line on bottom axes of <xref ref-type="fig" rid="fig1">Figure 1A</xref>). The resulting segments are further cleaned up using two more parameters. Any periods between segments that are shorter than a minimum silent interval are removed, merging any syllables neighboring those intervals, and then finally any remaining segments shorter than a minimum syllable duration are removed. 
We used the implementation of this audio segmenting algorithm in the evfuncs tool (<xref ref-type="bibr" rid="bib62">Nicholson, 2021c</xref>), that correctly replicates segmentation of the (<xref ref-type="bibr" rid="bib58">Nicholson et al., 2017</xref>) dataset, which was segmented using Matlab code developed for previous papers (see for example <xref ref-type="bibr" rid="bib92">Tumer and Brainard, 2007</xref>).</p></sec><sec id="s4-1-1-2"><title>Estimating segmenting parameters for canary song</title><p>As we state in the Introduction, the same algorithm cannot be applied to canary song. In spite of this, we apply the algorithm to canary song for results in <xref ref-type="fig" rid="fig3">Figure 3</xref>, to make very clear the issues that would result from relying on it. To estimate parameters that would produce the least amount of errors when segmenting canary song with this algorithm, we wrote a script that found the following for each bird’s song: (1) the median amplitude at all syllable onsets and offsets in the ground truth data with segmentation adjusted by human annotators, (2) the 10th percentile of syllable durations, and (3) the 0.1th percentile of silent intervals between syllables. We visually inspected the distributions of these values extracted from all segments, with our estimated segmenting parameters superimposed, to validate that we would not artificially create a very large number of errors by using the parameters we found with this script.</p></sec></sec></sec><sec id="s4-2"><title>Annotation of Bengalese finch song</title><p>Experiments in <xref ref-type="fig" rid="fig4">Figure 4</xref> included song from four birds in the ‘BirdsongRecognition’ dataset (<xref ref-type="bibr" rid="bib44">Koumura, 2016</xref>). The models in the original study were designed to annotate specific sequences within song, as described in their methods. The goal of our model is to annotate entire bouts of song. 
To use that dataset in our experiments, we needed to fully annotate all bouts of song. If we did not label all syllables, then our model would be unfairly penalized when it correctly annotated syllables that were present in the original dataset, but were not annotated. Two of the authors (Cohen and Nicholson) fully annotated the song from four of the birds, employing the same GUI application used to annotate canary song. The vast majority of syllables that we labeled were the low-frequency, high-entropy ‘introduction’ notes that occur at the beginning of some song bouts in varying numbers, that are often ignored during analysis of zebra finch and Bengalese finch song. For the handful of cases where other syllables were not labeled, we chose from among the classes present in the already-annotated data to assign labels to these. In some very rare cases, we found syllables where the category was not clear, similar to the cases we describe for canary song in <xref ref-type="fig" rid="fig9">Figure 9</xref>. We chose to assign a separate class to these and remove song bouts containing these classes from both the training and test sets. As we acknowledge in <xref ref-type="fig" rid="fig9">Figure 9</xref> and the Discussion, an inability to handle edge cases like these is a limitation of any standard supervised learning algorithm like ours, that operates at the level of syllables. We removed these cases so that we could be sure that benchmarking results accurately reflected how well the model performed on well-classified syllables.</p></sec><sec id="s4-3"><title>Annotation of canary song</title><sec id="s4-3-1"><title>Bootstrapping annotation with TweetyNet</title><p>In this manuscript, we used annotated domestic canary datasets an order of magnitude larger than previously published. To create these datasets we used TweetyNet followed by manual proofreading of its results. This process, described below, allowed ’bootstrapping’ TweetyNet’s performance. 
Song syllables were segmented and annotated in a semi-automatic process:</p><list list-type="bullet"><list-item><p>A set of 100 songs was manually segmented and annotated using a <ext-link ext-link-type="uri" xlink:href="https://github.com/yardencsGitHub/BirdSongBout/tree/master/helpers/GUI">GUI</ext-link> developed in-house (<xref ref-type="bibr" rid="bib18">Cohen, 2022</xref>). This set was chosen to include all potential syllable types as well as cage noises.</p></list-item><list-item><p>The manually labeled set was used to train <ext-link ext-link-type="uri" xlink:href="https://github.com/yardencsGitHub/tweetynet">TweetyNet</ext-link> (<xref ref-type="bibr" rid="bib64">Nicholson, 2022</xref>).</p></list-item><list-item><p>In both the training phase of TweetyNet and the prediction phase for new annotations, data is fed to TweetyNet in segments of 1 second and TweetyNet's output is the most likely label for each 2.7 ms time bin in the recording.</p></list-item><list-item><p>The trained algorithm annotated the rest of the data and its results were manually verified and corrected.</p></list-item></list></sec><sec id="s4-3-2"><title>Assuring the identity and separation of syllable classes</title><p>The manual steps in the pipeline described above can still miss rare syllable types or mislabel syllables into the wrong classes because of the human annotator’s mistake or because some annotators are more likely to lump or split syllable classes. To address this potential variability in canaries, where each bird can have as many as 50 different syllables, we made sure two annotators agree on the definition of the syllable classes. Then, to make sure that the syllable classes are well separated, all the spectrograms of every instance of every syllable, as segmented in the previous section, were zero-padded to the same duration. 
An outlier detection algorithm (<ext-link ext-link-type="uri" xlink:href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html">IsolationForest</ext-link>) was used to flag and re-check potential mislabeled syllables or previously unidentified syllable classes.</p></sec></sec><sec id="s4-4"><title>Segmenting annotated phrases of Waterslager canaries</title><p>In <xref ref-type="fig" rid="fig1s2">Figure 1—figure supplement 2</xref> we include data from waterslager canaries, available from a previous project in the Gardner lab (<xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref>). To include this data, we needed to break annotated phrase segments into syllable segments. Songs were previously segmented into phrases, trilled repetitions of syllables, and not to individual syllables. In each segmented phrase, we separated vocalization and noise fluctuations between vocalizations by fitting a two-state hidden Markov model with Gaussian emission functions to the acoustic signal. Putative syllable segments produced by this procedure were proofread and manually corrected using a <ext-link ext-link-type="uri" xlink:href="https://github.com/yardencsGitHub/BirdSongBout/tree/master/helpers/GUI">GUI</ext-link> developed in-house.</p></sec><sec id="s4-5"><title>Generating spectrograms</title><p>Spectrograms were generated from audio files using custom Numpy (Bengalese finch) or Matlab (canary) code. For Bengalese finches, the code we used to generate spectrograms is built into the <ext-link ext-link-type="uri" xlink:href="https://github.com/NickleDave/vak/blob/main/src/vak/spect.py">vak library</ext-link>. 
For canaries, the code we used to generate spectrograms can be found <ext-link ext-link-type="uri" xlink:href="https://github.com/jmarkow/zftftb/blob/master/zftftb_pretty_sonogram.m">here</ext-link> (<xref ref-type="bibr" rid="bib52">Markowitz, 2022b</xref>).</p><p>All spectrograms for song from a given species were created with the same parameters, such as the number of samples in the window for the Fast Fourier Transform (NFFT). For Bengalese finch song, we used <inline-formula><mml:math id="inf9"><mml:mrow><mml:mtext>NFFT</mml:mtext><mml:mo>=</mml:mo><mml:mn>512</mml:mn></mml:mrow></mml:math></inline-formula> with a step size of 64. For canaries we used <inline-formula><mml:math id="inf10"><mml:mrow><mml:mtext>NFFT</mml:mtext><mml:mo>=</mml:mo><mml:mn>1024</mml:mn></mml:mrow></mml:math></inline-formula> with a step size of 119. This produced spectrograms with a time bin size of 1 ms for Bengalese finches, and 2.7 ms for canaries.</p><p>We chose spectrogram parameters such that the duration of a time bin was just smaller than the shortest silent gaps between syllables. A larger bin size would have prevented our model from producing correct segments whenever one of the true silent gaps was shorter than our bin size. In initial studies we experimented with even smaller bin sizes, but found that the network tended to over-segment. Future work could compensate for this over-segmenting by modifying the loss function as we point out in the Discussion.</p></sec><sec id="s4-6"><title>Neural network architecture</title><p>Here, we provide a detailed description of the TweetyNet neural network architecture. 
The architecture that we develop is most directly related to those that have been used for event detection in audio and video (<xref ref-type="bibr" rid="bib10">Böck and Schedl, 2012</xref>; <xref ref-type="bibr" rid="bib68">Parascandolo et al., 2016</xref>) and for phoneme classification and sequence labeling (<xref ref-type="bibr" rid="bib29">Graves and Schmidhuber, 2005</xref>; <xref ref-type="bibr" rid="bib31">Graves, 2012</xref>). It is also somewhat similar to deep network models for speech recognition, but a crucial difference is that state-of-the-art models in that area map directly from sequences of acoustic features to sequences of words (<xref ref-type="bibr" rid="bib30">Graves et al., 2006</xref>). The success of these state-of-the-art models is attributed to the fact that they learn this mapping from speech to text, <italic>avoiding</italic> the intermediate step of classifying each frame of audio, as has previously been shown (<xref ref-type="bibr" rid="bib31">Graves, 2012</xref>). In other words, they avoid the problem of classifying every frame that we set out to solve.</p><p>As described in the introduction, the network takes as input batches of windows from a spectrogram (cf. top of <xref ref-type="fig" rid="fig2">Figure 2</xref>) and produces as output a tensor of size (<inline-formula><mml:math id="inf11"><mml:mi>m</mml:mi></mml:math></inline-formula> batches <inline-formula><mml:math id="inf12"><mml:mrow><mml:mi/><mml:mo>×</mml:mo><mml:mi>c</mml:mi></mml:mrow></mml:math></inline-formula> classes <inline-formula><mml:math id="inf13"><mml:mrow><mml:mi/><mml:mo>×</mml:mo><mml:mi>t</mml:mi></mml:mrow></mml:math></inline-formula> time bins). In <xref ref-type="fig" rid="fig10">Figure 10</xref> we show how the network blocks in the figure above relate to the shapes of tensors, and how those shapes change as the network maps input to output. We refer to this as we give specific parameters here. 
Note that we leave out the batch dimension in this description.</p><fig id="fig10" position="float"><label>Figure 10.</label><caption><title>TweetyNet architecture and tensor shapes resulting from each operation in the network.</title></caption><graphic mime-subtype="tiff" mimetype="image" xlink:href="elife-63853.xml.media/elife-63853-fig10-v2.jpg"/></fig><sec id="s4-6-1"><title>Convolutional blocks</title><p>The spectrogram window passes through two standard convolutional blocks, each of which consists of a convolutional layer and a max pooling layer. We use the standard term ‘block’ to refer to a layer that performs several operations on its inputs. For a convolutional block, the first operation is the convolution operation itself (’2D conv.’ in <xref ref-type="fig" rid="fig2">Figures 2</xref> and <xref ref-type="fig" rid="fig10">10</xref>), as it is defined in the deep learning literature. This cross-correlation-like operation (asterisk in <xref ref-type="fig" rid="fig2">Figures 2</xref> and <xref ref-type="fig" rid="fig10">10</xref>) between the spectrogram window and the learned filters (greyscale boxes in <xref ref-type="fig" rid="fig2">Figures 2</xref> and <xref ref-type="fig" rid="fig10">10</xref>) produces a set of feature maps. In both convolutional blocks, we use filters of size (5 × 5), with a stride of 1. The first block contains 32 filters and the second contains 64, as shown in <xref ref-type="fig" rid="fig10">Figure 10</xref>. We pad the feature maps so that they are the same size as the input. 
For a spectrogram window of size (176 time bins × 513 frequency bins), as we use for Bengalese finch song, this would produce an output of (176 × 513 × 32) from the first convolution, similar to what is shown in <xref ref-type="fig" rid="fig10">Figure 10</xref>.</p><p>A key advantage of convolutional layers is that they enable ‘weight sharing’, that is, the relatively small number of parameters in each filter is applied to the input via the convolution operation, instead of needing to have weights for each dimension of the input (e.g. each pixel in an image). We used a full weight-sharing scheme, meaning that each filter was cross-correlated with the entire input window. Previous work has tested whether performance on tasks related to ours, such as speech recognition, might be improved by alternative weight-sharing schemes, for instance by restricting filters to specific frequency ranges within a spectrogram. One previous study did report a benefit for this limited weight-sharing scheme applied to automatic speech recognition (<xref ref-type="bibr" rid="bib1">Abdel-Hamid et al., 2014</xref>). However, this approach has not been widely adopted, and to the best of our knowledge, the common ‘full’ weight-sharing approach that we employ here is used by all state-of-the-art models for speech recognition, for example (<xref ref-type="bibr" rid="bib5">Amodei et al., 2016</xref>), as well as the event detection models that we based our architecture on (<xref ref-type="bibr" rid="bib68">Parascandolo et al., 2016</xref>; <xref ref-type="bibr" rid="bib47">Lea et al., 2017</xref>).</p></sec><sec id="s4-6-2"><title>Max pooling layer</title><p>In both convolutional blocks, we followed the convolutional layer by a max pooling layer. 
The max pooling operation (’Pooling’ in <xref ref-type="fig" rid="fig2">Figure 2</xref>) downsamples feature maps by sliding a window across the input (orange bin in <xref ref-type="fig" rid="fig2">Figure 2</xref>) and at each point keeping only the element with the maximum value within the window. We used a window size of (8 × 1) and a stride of (8, 1), with windows oriented so that the side of size one and the stride of size one were in the temporal dimension. Both the shape and stride were meant to avoid downsampling in the temporal dimension, under the assumption that it was important to retain this information. Applying a max pooling operation with these parameters to the first set of convolutional maps with size (176 × 513 × 32) produces an output tensor with size (176 × 64 × 32) as shown in <xref ref-type="fig" rid="fig10">Figure 10</xref>.</p><p>The max pooling operation is widely used in networks for related tasks like automatic speech recognition and audio event detection. Previous work has not found any benefit to alternative pooling operations such as stochastic pooling (<xref ref-type="bibr" rid="bib73">Sainath et al., 2013a</xref>) and alternatives have not been widely adopted. To our knowledge most related work also adopts our approach of not down-sampling in the temporal dimension, and studies have not found any advantage when using larger strides in the temporal dimension (<xref ref-type="bibr" rid="bib73">Sainath et al., 2013a</xref>).</p></sec><sec id="s4-6-3"><title>Recurrent layer</title><p>The output of the second convolutional block passes through a recurrent layer made up of LSTM units. Before passing it into the recurrent layer, we stack the feature maps: e.g. an output of (176 time bins × eight down-sampled frequency bins × 64 feature maps) becomes (176 time bins × 512 features) (indicated as &quot;Flatten&quot; in <xref ref-type="fig" rid="fig10">Figure 10</xref>). 
We specifically use a bidirectional LSTM, meaning the recurrent network processes the input in both the forward and backward direction. By default we set the size of the hidden state in the network equal to the ‘features’ dimension, and based on experiments in <xref ref-type="fig" rid="fig5">Figure 5</xref> this appears to be a reasonable default. The matrix of hidden states for all time steps becomes the output that we feed into the final layer. We adopt the standard practice of concatenating the hidden states from running the sequence in the forward and backward directions, doubling its size.</p></sec><sec id="s4-6-4"><title>Linear layer</title><p>The final layer in TweetyNet is a linear projection (<inline-formula><mml:math id="inf14"><mml:msub><mml:mover accent="true"><mml:mi>W</mml:mi><mml:mo>→</mml:mo></mml:mover><mml:mrow><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, purple matrix in <xref ref-type="fig" rid="fig2">Figure 2</xref>) of the recurrent layer’s output onto the different syllable classes, <inline-formula><mml:math id="inf15"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>…</mml:mo><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:mstyle></mml:math></inline-formula>, resulting in a vector of <inline-formula><mml:math id="inf16"><mml:mi>n</mml:mi></mml:math></inline-formula> syllable-similarity scores for each spectrogram time bin <inline-formula><mml:math id="inf17"><mml:mi>t</mml:mi></mml:math></inline-formula>. The number of classes, <inline-formula><mml:math id="inf18"><mml:mi>n</mml:mi></mml:math></inline-formula>, is predetermined by the user. To segment syllables, the bin-wise syllable-similarity scores are first used to select a single syllable class per time bin by choosing the label with the highest syllable-similarity score. Since similarity scores can be normalized, this is akin to maximum a-posteriori (MAP) label selection. 
Then, the labeled time bins are used to separate continuous song segments from no-song segments and to annotate each song-segment with a single label using majority decision across time bins in that segment.</p></sec></sec><sec id="s4-7"><title>Training and benchmarking</title><sec id="s4-7-1"><title>Input data transformations</title><sec id="s4-7-1-1"><title>Windows</title><p>As stated above, the input to the network consists of spectrogram windows. We used a window size of 176 time bins for Bengalese finch song and 370 time bins for canary song, with the exception of experiments testing the impact of this hyperparameter in <xref ref-type="fig" rid="fig5">Figure 5</xref>.</p></sec><sec id="s4-7-1-2"><title>Vectors of labeled time bins</title><p>We formulate annotation as a supervised learning problem where each spectrogram window <inline-formula><mml:math id="inf19"><mml:mi>x</mml:mi></mml:math></inline-formula> has a corresponding vector of labeled time bins <inline-formula><mml:math id="inf20"><mml:mi>y</mml:mi></mml:math></inline-formula>, and our goal is to train the network <inline-formula><mml:math id="inf21"><mml:mi>f</mml:mi></mml:math></inline-formula> to correctly map a window to this vector of labeled time bins, <inline-formula><mml:math id="inf22"><mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>→</mml:mo><mml:mi>y</mml:mi></mml:mrow></mml:math></inline-formula>. These vectors are generated dynamically by our software from annotations consisting of segment labels and their onset and offset times. 
Each element in the vector <inline-formula><mml:math id="inf23"><mml:mi>y</mml:mi></mml:math></inline-formula> contains an integer label <italic>c</italic><sub><italic>t</italic></sub> corresponding to the syllable class <inline-formula><mml:math id="inf24"><mml:mi>c</mml:mi></mml:math></inline-formula> in that time bin of the spectrogram window <italic>x</italic><sub><italic>t</italic></sub>. To this set of class labels, we add an additional class for the ‘background’ time bins that result naturally from gaps between annotated segments (’Bg.’ in <xref ref-type="fig" rid="fig2">Figure 2</xref>). This ‘background’ class will include brief quiet periods between syllables, as well as any other longer periods left unlabeled by the annotator that may contain non-song bird calls and background noises.</p></sec><sec id="s4-7-1-3"><title>Batches of (window, labeled time bin vector) pairs</title><p>During training, we randomly grab a batch of (window, labeled time bin vector) pairs <inline-formula><mml:math id="inf25"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> from all possible windows in the dataset X. To achieve this, we developed a torch Dataset class that represents all such possible windows paired with the corresponding vector of labeled timebins <inline-formula><mml:math id="inf26"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. 
The class tracks which windows the network has already seen during any epoch (iteration through the entire data set), ensuring that we avoid repeating the same windows during training, which could encourage the network to memorize the training data.</p><p>The choice to randomly present windows also acts as a form of data augmentation that encourages the network to exhibit translation invariance. That is, because the network sees very similar sequences repeatedly, but those sequences are randomly shifted forward or backward slightly in time, it learns to correctly classify all time bins in a window regardless of how the window is presented to the model.</p></sec><sec id="s4-7-1-4"><title>Normalization</title><p>Normalization is a standard practice that improves optimization of machine learning models, but is not always necessary for neural networks.</p><p>For Bengalese finch song, we normalized spectrograms; more precisely we standardized by finding the mean μ and standard deviation <inline-formula><mml:math id="inf27"><mml:mi>σ</mml:mi></mml:math></inline-formula> of every frequency bin across all spectrograms in the training set, and then for every window <inline-formula><mml:math id="inf28"><mml:mi>x</mml:mi></mml:math></inline-formula> we subtracted off the mean and divided by the standard deviation: <inline-formula><mml:math id="inf29"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mtext>normalized</mml:mtext></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>x</mml:mi><mml:mo>-</mml:mo><mml:mi>μ</mml:mi></mml:mrow><mml:mi>σ</mml:mi></mml:mfrac></mml:mrow></mml:math></inline-formula>. Note that we achieved this with a SpectrogramScaler class built into the vak library rather than pre-processing with a script. For canary song we did not apply this normalization, and left the spectrograms as processed by the Matlab code referenced above. 
We did not systematically assess how normalization impacted performance.</p></sec><sec id="s4-7-1-5"><title>Spectrogram thresholding</title><p>We did not apply any thresholding to spectrograms, as is often done when visualizing them to increase contrast between sounds of interest and often quieter background noise. In preliminary experiments, we did test the effect of thresholding spectrograms, setting any value of the power spectrum less than the specified threshold to zero. However, we found that this led to a slight increase in error rates, and also made training more unstable. Our best guess for this effect of thresholding is that it produces abrupt, large magnitude changes in values in the spectrogram that may affect the gradient computed during training.</p></sec></sec></sec><sec id="s4-8"><title>Metrics</title><p>We define the metrics we use before describing our training methods, since our methods depend on these metrics. We measured performance of TweetyNet with two metrics.</p><sec id="s4-8-1"><title>Frame error</title><p>The first is the frame error, which simply measures for each acoustic frame (in our case, each time bin in a spectrogram) whether the predicted label matches the ground truth label. Hence the frame error ranges between 0 and 1, that is, it can be stated as a percentage, and gives an intuitive measure of a model’s overall performance. Previous work on supervised sequence labeling, including bidirectional-LSTM architectures similar to ours, has used this metric (<xref ref-type="bibr" rid="bib31">Graves, 2012</xref>; <xref ref-type="bibr" rid="bib29">Graves and Schmidhuber, 2005</xref>).</p></sec><sec id="s4-8-2"><title>Syllable error rate</title><p>The second metric we used is commonly called the word error rate in the speech recognition literature, and here we call it the syllable error rate. 
Because the syllable error rate is key to our results, we define it here, as shown in <xref ref-type="disp-formula" rid="equ1">Equation 1</xref>.<disp-formula id="equ1"><label>(1)</label><mml:math id="m1"><mml:mrow><mml:mtable columnalign="left left" columnspacing="1em" rowspacing="4pt"><mml:mtr><mml:mtd><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>Syllable Error rate</mml:mtext></mml:mstyle></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mtext>Edit distance(reference sequence, predicted)</mml:mtext><mml:mtext>Length(reference sequence)</mml:mtext></mml:mfrac></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mstyle displaystyle="true" scriptlevel="0"/></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mtext>Substitutions + Insertions + Deletions</mml:mtext><mml:mtext>Length(reference sequence)</mml:mtext></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math></disp-formula></p><p>This metric is an edit distance, that counts the number of edits (insertions, deletions, and substitutions) needed to correct a predicted sequence so it matches the ground-truth (‘reference’) sequence. A common algorithm used to compute the number of edits is the Levenshtein distance that we use here. 
The edit distance is normalized by the length of the ground truth sequence, to make it possible to compare between sequences of different lengths.</p></sec></sec><sec id="s4-9"><title>Training</title><p>We trained all models using the Adam optimizer (<xref ref-type="bibr" rid="bib41">Kingma and Ba, 2014</xref>) with a learning rate of 0.001, and other hyperparameters set to the defaults in the torch library: <inline-formula><mml:math id="inf30"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>β</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>β</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>0.9</mml:mn><mml:mo>,</mml:mo><mml:mn>0.999</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mi>ϵ</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mi>e</mml:mi><mml:mo>−</mml:mo><mml:mn>08</mml:mn><mml:mo>,</mml:mo><mml:mrow><mml:mtext>weight decay</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mn>0.0</mml:mn><mml:mo>,</mml:mo><mml:mrow><mml:mtext>amsgrad</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mtext mathvariant="monospace">False</mml:mtext></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:math></inline-formula>.</p><p>For all experiments, we used a batch size of 8. We specify a number of epochs in our configuration files (an epoch is one iteration through the entire training dataset) but in practice we found that the number of windows is so large that we did not complete one entire epoch of training before network performance on the validation set met the criteria for early stopping, as described in the next paragraph.</p><sec id="s4-9-1"><title>Early stopping</title><p>To mitigate the tendency of neural networks to overfit, we employed early stopping. 
Error rates are measured on a validation set every val_step training steps, and training stops early if these error rates do not decrease after patience consecutive validation steps, where val_step and patience are option values declared by a user in configuration files for the vak library. We chose to specify validation in terms of a global step instead of epoch, because as just stated the size of the dataset of all possible windows is so large that training rarely completed an entire epoch. For each bird, the validation data set was kept separate from the training and test data sets. For Bengalese finches and canaries we used: <inline-formula><mml:math id="inf31"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext mathvariant="monospace">val_step</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mn>250</mml:mn><mml:mo>,</mml:mo><mml:mrow><mml:mtext mathvariant="monospace">patience</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mn>4</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:math></inline-formula>.</p></sec></sec><sec id="s4-10"><title>Learning Curves</title><p>To estimate how much manually annotated training data is required to achieve a certain level of model performance, we generated learning curves that plot a metric such as frame error as a function of the amount of training data, as in the experiments shown in <xref ref-type="fig" rid="fig3">Figures 3</xref> and <xref ref-type="fig" rid="fig4">4</xref>. These experiments followed standard methods for benchmarking supervised machine learning algorithms, following good practices (<xref ref-type="bibr" rid="bib37">James et al., 2013</xref>), such as training multiple replicates on separate subsets of the training data. Producing these learning curves where the dataset size is measured in duration required extra steps not needed for other tasks such as image classification. 
For each individual bird, we fit networks with training sets of increasing size (duration in seconds) and then measured performance on a separate, fixed test set. For each training replicate, audio files were drawn at random from a fixed-size total training set until the target size (e.g. 60 s) was reached. If the total duration of the randomly drawn audio files extended beyond the target, they were clipped at that target duration while ensuring that all syllable classes were still present in the training set. After training completed, we computed metrics such as frame error and syllable error rate on the held-out test set for each bird. As stated, we chose to use a totally separate fixed-size set, instead of e.g. using the remainder of the training data set, or generating multiple test sets in a <inline-formula><mml:math id="inf32"><mml:mi>k</mml:mi></mml:math></inline-formula>-fold validation scheme. We did this for two reasons: first, because computing metrics on relatively large test sets can be computationally expensive, and second, because we wanted to be sure that any variance in our measures across training replicates could be attributed to the randomly drawn training set, and not to changes in the test set.</p><p>In the case of Bengalese finches, we used training sets with durations {30, 45, 75, 120, 180, 480, 600}, training 10 replicates for each duration, with subsets drawn randomly from a total training set of 900 seconds for each individual bird. The duration of the fixed test set for each bird was 400 s. For canaries, we used training sets of durations {120, 180, 240, 300, 360, 420, 480, 540, 600, 660}, training seven replicates for each duration, with subsets drawn randomly from a total training set of 25,000 s for each bird. The duration of the fixed test set for each bird was 5000 s. 
For the point estimate of the model’s asymptotic syllable error rate on canary song, we used a training set of 6000 s and a test set of 5000 s.</p><p>The method for generating learning curves as just described is built into the vak library and can be reproduced using the learncurve command in the terminal, along with the configuration files we shared.</p></sec><sec id="s4-11"><title>Comparison with a support vector machine model</title><p>In <xref ref-type="fig" rid="fig3">Figure 3</xref>, we compare performance of TweetyNet with a Support Vector Machine (SVM) model. We trained the model on a set of audio features first described in <xref ref-type="bibr" rid="bib85">Tachibana et al., 2014</xref>. Feature extraction code was translated to Python from original Matlab code kindly shared by the author. Based on previous work (<xref ref-type="bibr" rid="bib57">Nicholson, 2016</xref>), we used a Support Vector Machine with a radial basis function (RBF) kernel. To find good values for the kernel coefficient <inline-formula><mml:math id="inf33"><mml:mi>γ</mml:mi></mml:math></inline-formula> and the regularization parameter <inline-formula><mml:math id="inf34"><mml:mi>C</mml:mi></mml:math></inline-formula>, we performed halving random search across a range of values (<inline-formula><mml:math id="inf35"><mml:mi>γ</mml:mi></mml:math></inline-formula>=(1e-9, 1e-3), <inline-formula><mml:math id="inf36"><mml:mi>C</mml:mi></mml:math></inline-formula>=(60,1e10), log uniform distribution). In initial tests, we found that values of <inline-formula><mml:math id="inf37"><mml:mi>γ</mml:mi></mml:math></inline-formula> larger than 1.0 tended to produce pathological behavior where the model predicted one class for all features. We chose ranges for hyperparameter search that avoided this behavior. 
To carry out hyperparameter search we developed a pipeline in scikit-learn (<xref ref-type="bibr" rid="bib70">Pedregosa et al., 2011</xref>; <xref ref-type="bibr" rid="bib32">Grisel et al., 2020</xref>). In very rare cases, for two of the four birds, we needed to perform more than one run of the pipeline to find hyperparameters that did not cause the pathological single-class predictions described above. To extract features and train models we adapted code from the hybrid-vocal-classifier library (<xref ref-type="bibr" rid="bib61">Nicholson, 2021b</xref>), which provides a high-level interface to scikit-learn, and our pipeline including hyperparameter tuning was similarly built with scikit-learn code. All SVM models were trained on the exact same train-test splits used for training TweetyNet, by using dataset files generated by vak. This meant that for each training set duration there were 10 replicates trained for Bengalese finch song and seven replicates for canary song.</p></sec><sec id="s4-12"><title>Statistics</title><p>To compare syllable error rates, we used the Wilcoxon paired signed-rank test, a non-parametric alternative to the t-test. We computed the test once for each training set duration, using paired samples: same number of training replicates that were each trained on a randomly drawn subset of training data, where the ‘factor’ within each pair of replicates was the model used, TweetyNet or the SVM. To test for homogeneity of variance, we used Levene’s test.</p><sec id="s4-12-1"><title>Additional analysis of model performance</title><sec id="s4-12-1-1"><title>Percentage of errors near boundaries</title><p>In ‘Simple post-processing greatly reduces syllable error rates’ we estimate the percentage of errors near boundaries. By ‘boundaries’ we mean the onset and offset times of syllables when they are manually annotated. A distribution of syllable durations computed from these onsets and offsets shows that boundaries are not static. 
There are two sources of this variation: naturally occurring motor variability in birdsong, and an additional noise component added by audio segmentation and human annotators. This variance in turn gives rise to frame errors, where the ground truth annotation and a trained TweetyNet model disagree about which of the time bins should be assigned the ’background’ label. These frame errors very close to boundaries are likely to have a much smaller impact on the syllable error rate than frame errors in the middle of syllables, because near the boundary they have no effect on the sequence of labels produced by segmenting the model output, and only a minor effect on the estimated onset and offset times. In the main text and in <xref ref-type="fig" rid="fig5s1">Figure 5—figure supplement 1</xref> we estimate the percent of all such frame errors occurring at these noisy syllable onset and offset boundaries. To do so, we computed for every onset and offset the number of frame errors within a fixed distance of two time bins that specifically involved disagreement between the ground truth annotation and the trained model on the ’background’ class.</p></sec><sec id="s4-12-1-2"><title>Errors in rare sequences</title><p>For every sequence of Bengalese finch syllables a-b we examined all possibilities for the following syllable and identified the most frequent sequence, a-b-<bold>x</bold>. Then, among all sequences a-b-<bold>y</bold> that are at least four times less frequent than a-b-<bold>x</bold>, we measured the frame error during the syllable <bold>y</bold>. This detailed analysis showed that there is a very small effect on rare variants. Namely, even if the sequence a-b-<bold>y</bold> appears 100–1000 times less frequently than a-b-<bold>x</bold> it does not incur high error rates in most cases. We use two statistical tests to quantify this claim. 
First, we measure the Pearson correlation between the relative frequency of the rare event (prob(a-b-<bold>y</bold>) divided by prob(a-b-<bold>x</bold>)) and the frame error in the syllable ’y’ (the fraction of spectrogram time bins not labeled ’y’ within that segment). Second, we divide the rare events into the more rare and more common (relative frequency smaller or larger than 1/8) and measure the fraction of rare events exceeding the median error rate. We use the binomial z-test to compare the fractions and show that the difference is not significant.</p></sec></sec></sec><sec id="s4-13"><title>Model output as syllable likelihoods</title><p>In <xref ref-type="fig" rid="fig9">Figure 9</xref>, we present model outputs one step prior to assigning the most likely label to each spectrogram time bin. At that stage, one before the <italic>argmax(N)</italic> step in <xref ref-type="fig" rid="fig2">Figure 2</xref>, the model output for a given time bin <inline-formula><mml:math id="inf38"><mml:mi>t</mml:mi></mml:math></inline-formula> is a real-valued affinity <inline-formula><mml:math id="inf39"><mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>s</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>∈</mml:mo><mml:mi class="ltx_font_mathcaligraphic">ℛ</mml:mi></mml:mrow></mml:math></inline-formula> of all predefined syllable classes <inline-formula><mml:math id="inf40"><mml:mi>s</mml:mi></mml:math></inline-formula>. 
In <xref ref-type="fig" rid="fig9">Figure 9</xref> we convert these numbers to likelihoods by subtracting the minimum value and normalizing separately for each time bin: <inline-formula><mml:math id="inf41"><mml:mrow><mml:mrow><mml:mi>L</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>s</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>s</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>-</mml:mo><mml:mrow><mml:mrow><mml:msub><mml:mi>min</mml:mi><mml:msup><mml:mi>s</mml:mi><mml:mo>′</mml:mo></mml:msup></mml:msub><mml:mo>&#x2061;</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mi>s</mml:mi><mml:mo>′</mml:mo></mml:msup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mrow><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mo largeop="true" symmetric="true">∑</mml:mo><mml:mi>σ</mml:mi></mml:msub></mml:mstyle><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>σ</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>-</mml:mo><mml:mrow><mml:mrow><mml:msub><mml:mi>min</mml:mi><mml:msup><mml:mi>s</mml:mi><mml:mo>′</mml:mo></mml:msup></mml:msub><mml:mo>&#x2061;</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mi>s</mml:mi><mml:mo>′</mml:mo></mml:msup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mrow></mml:math></inline-formula>. 
This transformation was done for presentation only. Applying the commonly-used softmax transform (<inline-formula><mml:math id="inf42"><mml:mrow><mml:mi>x</mml:mi><mml:mo>→</mml:mo><mml:mfrac><mml:mrow><mml:mi>e</mml:mi><mml:mo></mml:mo><mml:mi>x</mml:mi><mml:mo></mml:mo><mml:mi>p</mml:mi><mml:mo></mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mo largeop="true" symmetric="true">∑</mml:mo><mml:mi>x</mml:mi></mml:msub></mml:mstyle><mml:mrow><mml:mi>e</mml:mi><mml:mo></mml:mo><mml:mi>x</mml:mi><mml:mo></mml:mo><mml:mi>p</mml:mi><mml:mo></mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:mfrac></mml:mrow></mml:math></inline-formula>) is equivalent since we only keep the maximal value.</p></sec><sec id="s4-14"><title>Analysis of behavioral data and predicted annotations</title><sec id="s4-14-1"><title>Bengalese finch branch points</title><p>We analyzed the Bengalese finch song in <xref ref-type="bibr" rid="bib58">Nicholson et al., 2017</xref> to determine whether we could replicate key findings about the stability of branch points from <xref ref-type="bibr" rid="bib96">Warren et al., 2012</xref> as described in the main text, and, if so, whether we could recover those results from annotations predicted by TweetyNet.</p><p>To analyze statistics of branch points in the Bengalese finch song from <xref ref-type="bibr" rid="bib58">Nicholson et al., 2017</xref>, we first identified candidate branch points by visual inspection of each bird’s annotated song. Then, for each day of a bird’s song, we counted all occurrences of transitions from one syllable class to another, that is, bigrams. 
We placed these counts in a matrix where rows were the first syllable of the bigram (‘from’) and the columns were the second syllable (‘to’), and then performed a row-wise normalization to produce a first-order Markov transition matrix, where the elements are transition probabilities from one syllable class to another. We also applied a threshold so that any elements in the matrix less than 0.002 were set to 0.</p><sec id="s4-14-1-1"><title>Statistical test</title><p>To test whether transition probabilities were stable across days, we used a permutation test, replicating the analysis of <xref ref-type="bibr" rid="bib96">Warren et al., 2012</xref>. We took all occurrences of a transition point across two days, and then for each permutation, swapped the label for which day it belonged to, and then computed the transition probabilities for the permuted days. Using 1000 permutations, we generated a distribution of differences and then asked whether the observed difference was larger than this bootstrapped distribution.</p></sec><sec id="s4-14-1-2"><title>Analysis of predicted annotations</title><p>Before testing whether we could recover the result that branch points were stable from annotations predicted by TweetyNet, we measured the syllable error rate of the trained models that we would use to predict annotations. For 1 day of song from one bird (or60yw70), we realized that the manually-annotated set of songs was even smaller (200 s) than the test sets we used in the benchmarking section. We removed this day because it is not really an ‘entire day’ of song.</p></sec></sec></sec><sec id="s4-15"><title>Canary syntax model</title><sec id="s4-15-1"><title>Shared template dependence on number of syllables in song</title><p>In each bird, we define an upper bound for repeating parts of songs using pairwise comparisons. For each song we examined all other songs with an equal or larger number of syllables and found the largest shared string of consecutive syllables. 
The fraction of shared syllables is the ratio between the length of the shared sequence and the number of syllables in the first, shorter, song. Then, we bin songs by syllable counts (bin size is 10 syllables) and calculate the mean and standard deviation across all pairwise comparisons. Results are shown in <xref ref-type="fig" rid="fig1s2">Figure 1—figure supplement 2</xref>.</p></sec><sec id="s4-15-2"><title>Probabilistic suffix trees</title><p>For each canary phrase type, we describe the dependency of the following transition on previous phrases with a probabilistic suffix tree. This method was described in a previous publication from our lab (<xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref>, <xref ref-type="bibr" rid="bib51">Markowitz, 2022a</xref>). Briefly, the tree is a directed graph in which each phrase type is a root node representing the first order (Markov) transition probabilities to downstream phrases, including the end of song. The pie charts in <xref ref-type="fig" rid="fig7">Figure 7</xref>, <xref ref-type="fig" rid="fig7s1">Figure 7—figure supplement 1</xref>, and <xref ref-type="fig" rid="fig7s2">Figure 7—figure supplement 2</xref> show such probabilities. Upstream nodes represent higher order Markov chains that are added sequentially if they significantly add information about the transition.</p></sec><sec id="s4-15-3"><title>Model cross validation to determine minimal node frequency</title><p>To prevent overfitting, nodes in the probabilistic suffix trees are added only if they appear more often than a threshold frequency, <inline-formula><mml:math id="inf43"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mo></mml:mo><mml:mi>i</mml:mi><mml:mo></mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. 
To determine <inline-formula><mml:math id="inf44"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mo></mml:mo><mml:mi>i</mml:mi><mml:mo></mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> we replicate the procedure in <xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref> and carry out a 10-fold model cross validation procedure. In this procedure the dataset is randomly divided into a training set, containing 90 percent of songs, and a test set, containing 10 percent of songs. A PST is created using the training set and used to calculate the negative log likelihood of the test set. This procedure is repeated 10 times for each value of <inline-formula><mml:math id="inf45"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mo></mml:mo><mml:mi>i</mml:mi><mml:mo></mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, the x-axis in <xref ref-type="fig" rid="fig8">Figure 8a</xref>. For data sets of different sizes (curves in <xref ref-type="fig" rid="fig8">Figure 8a</xref>, x-axis in <xref ref-type="fig" rid="fig8">Figure 8b</xref>) the mean negative log-likelihood across the 10 cross validation subsets and across 10 data sets, y-axis in <xref ref-type="fig" rid="fig8">Figure 8a</xref>, is then used to find the optimal value of <inline-formula><mml:math id="inf46"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mo></mml:mo><mml:mi>i</mml:mi><mml:mo></mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> - the minimum negative log-likelihood that corresponds to the highest precision without over-fitting the training set. 
All PSTs in <xref ref-type="fig" rid="fig7">Figure 7</xref>, <xref ref-type="fig" rid="fig7s1">Figure 7—figure supplement 1</xref>, and <xref ref-type="fig" rid="fig7s2">Figure 7—figure supplement 2</xref> are created using the cross-validated <inline-formula><mml:math id="inf47"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mo></mml:mo><mml:mi>i</mml:mi><mml:mo></mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>.</p></sec></sec><sec id="s4-16"><title>Code Availability</title><p>The code implementing the TweetyNet architecture, and code to reproduce experiments and figures in this paper, are available <ext-link ext-link-type="uri" xlink:href="https://github.com/yardencsGitHub/tweetynet">here</ext-link> (version 0.7.1, 10.5281/zenodo.5823556).</p><p>To aid with reproducibility of our experiments, and to make TweetyNet more accessible to researchers studying birdsong and other animal vocalizations, we developed a software library, vak (<xref ref-type="bibr" rid="bib59">Nicholson and Cohen, 2021</xref>), available <ext-link ext-link-type="uri" xlink:href="https://github.com/NickleDave/vak">here</ext-link>.</p><p>We also developed Python tools to work with the datasets and their annotation: (<xref ref-type="bibr" rid="bib60">Nicholson, 2021a</xref>; <xref ref-type="bibr" rid="bib62">Nicholson, 2021c</xref>; <xref ref-type="bibr" rid="bib63">Nicholson, 2021d</xref>).</p><p>All software was implemented in Python, leveraging the following open-source scientific libraries, frameworks, and tools: attrs (<xref ref-type="bibr" rid="bib79">Schlawack, 2020</xref>), dask (<xref ref-type="bibr" rid="bib21">Dask Development Team, 2016</xref>), jupyter (<xref ref-type="bibr" rid="bib42">Kluyver et al., 2016</xref>), matplotlib (<xref ref-type="bibr" rid="bib36">Hunter, 2007</xref>; <xref ref-type="bibr" rid="bib14">Caswell et al., 2020</xref>), numpy (<xref ref-type="bibr" rid="bib93">van der Walt et al., 2011</xref>; <xref ref-type="bibr" 
rid="bib33">Harris et al., 2020</xref>), pandas (<xref ref-type="bibr" rid="bib67">pandas development team, 2020</xref>), scikit-learn (<xref ref-type="bibr" rid="bib70">Pedregosa et al., 2011</xref>; <xref ref-type="bibr" rid="bib32">Grisel et al., 2020</xref>), scipy (<xref ref-type="bibr" rid="bib95">Virtanen et al., 2020</xref>), torch (<xref ref-type="bibr" rid="bib69">Paszke et al., 2017</xref>), torchvision (<xref ref-type="bibr" rid="bib49">Marcel and Rodriguez, 2010</xref>), seaborn (<xref ref-type="bibr" rid="bib97">Waskom et al., 2020</xref>; <xref ref-type="bibr" rid="bib98">Waskom, 2021</xref>), and tqdm (<xref ref-type="bibr" rid="bib19">da Costa-Luis, 2019</xref>).</p></sec><sec id="s4-17"><title>Data Collection</title><sec id="s4-17-1"><title>Use of existing datasets</title><p>Bengalese finch song is from two publicly-available repositories. Results in <xref ref-type="fig" rid="fig3">Figures 3</xref>, <xref ref-type="fig" rid="fig4">4</xref> and <xref ref-type="fig" rid="fig6">6</xref> all make use of &quot;<ext-link ext-link-type="uri" xlink:href="https://figshare.com/articles/Bengalese_Finch_song_repository/4805749">Bengalese finch Song Repository</ext-link>&quot; (<xref ref-type="bibr" rid="bib58">Nicholson et al., 2017</xref>). For experiments in <xref ref-type="fig" rid="fig4">Figure 4</xref> we added song from four Bengalese finches in an additional <ext-link ext-link-type="uri" xlink:href="https://figshare.com/articles/BirdsongRecognition/3470165">dataset</ext-link> (<xref ref-type="bibr" rid="bib44">Koumura, 2016</xref>) that accompanied the paper (<xref ref-type="bibr" rid="bib45">Koumura and Okanoya, 2016</xref>). Please see ‘Annotation of Bengalese finch song’ for a description of how we annotated that data. 
Supplementary figures with descriptive statistics of song also use datasets of Waterslager canary songs (<xref ref-type="bibr" rid="bib50">Markowitz et al., 2013</xref>), Bengalese finch songs (<xref ref-type="bibr" rid="bib45">Koumura and Okanoya, 2016</xref>) and Zebra finch songs (<xref ref-type="bibr" rid="bib66">Otchy et al., 2015</xref>) generously shared by those authors.</p></sec><sec id="s4-17-2"><title>Domestic canary song screening</title><p>Birds were individually housed in soundproof boxes and recorded for 3–5 days (Audio-Technica AT831B Lavalier Condenser Microphone, M-Audio Octane amplifiers, HDSPe RayDAT sound card and VOS games' Boom Recorder software on a Mac Pro desktop computer). In-house software was used to detect and save only sound segments that contained vocalizations. These recordings were used to select subjects that are copious singers (<inline-formula><mml:math id="inf48"><mml:mrow><mml:mi/><mml:mo>≥</mml:mo><mml:mn>50</mml:mn></mml:mrow></mml:math></inline-formula> songs per day) and produce at least 10 different syllable types.</p></sec><sec id="s4-17-3"><title>Domestic canary audio recording</title><p>All data used in this manuscript was acquired between late April and early May 2018 - a period during which canaries perform their mating season songs. Birds were individually housed in soundproof boxes and recorded for 7–10 days (Audio-Technica AT831B Lavalier Condenser Microphone, M-Audio M-track amplifiers, and VOS games' Boom Recorder software on a Mac Pro desktop computer). In-house software was used to detect and save only sound segments that contained vocalizations. 
Separate songs were defined by silence gaps exceeding 1 second.</p></sec></sec></sec></body><back><sec id="s5" sec-type="additional-information"><title>Additional information</title><fn-group content-type="competing-interest"><title>Competing interests</title><fn fn-type="COI-statement" id="conf1"><p>No competing interests declared</p></fn><fn fn-type="COI-statement" id="conf2"><p>No competing interests declared</p></fn></fn-group><fn-group content-type="author-contribution"><title>Author contributions</title><fn fn-type="con" id="con1"><p>Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Resources, Software, Supervision, Visualization, Writing – original draft, Writing – review and editing</p></fn><fn fn-type="con" id="con2"><p>Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Resources, Software, Validation, Visualization, Writing – original draft, Writing – review and editing</p></fn><fn fn-type="con" id="con3"><p>Data curation</p></fn><fn fn-type="con" id="con4"><p>Data curation</p></fn><fn fn-type="con" id="con5"><p>Data curation</p></fn><fn fn-type="con" id="con6"><p>Funding acquisition, Project administration, Resources, Writing – original draft, Writing – review and editing</p></fn></fn-group><fn-group content-type="ethics-information"><title>Ethics</title><fn fn-type="other"><p>All procedures were approved by the Institutional Animal Care and Use Committees of Boston University (protocol numbers 14-028 and 14-029). Song data were collected from adult male canaries (n = 5). Canaries were individually housed for the entire duration of the experiment and kept on a light-dark cycle matching the daylight cycle in Boston (42.3601 N). 
The birds were not used in any other experiments.</p></fn></fn-group></sec><sec id="s6" sec-type="supplementary-material"><title>Additional files</title><supplementary-material id="transrepform"><label>Transparent reporting form</label><media mime-subtype="pdf" mimetype="application" xlink:href="elife-63853-transrepform1-v2.pdf"/></supplementary-material></sec><sec id="s7" sec-type="data-availability"><title>Data availability</title><p>Datasets of annotated Bengalese finch song are available <ext-link ext-link-type="uri" xlink:href="https://figshare.com/articles/Bengalese_Finch_song_repository/4805749">here</ext-link> and <ext-link ext-link-type="uri" xlink:href="https://figshare.com/articles/BirdsongRecognition/3470165">here</ext-link>. Datasets of annotated canary song are available <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5061/dryad.xgxd254f4">here</ext-link>. Model checkpoints, logs, and source data files are available <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.5061/dryad.gtht76hk4">here</ext-link>. 
Source data files for figures are in the repository associated with the paper <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.5823556">here</ext-link> (version 0.7.1).</p><p>The following datasets were generated:</p><p><element-citation id="dataset1" publication-type="data" specific-use="isSupplementedBy"><person-group person-group-type="author"><name><surname>Cohen</surname><given-names>Y</given-names></name></person-group><year iso-8601-date="2022">2022</year><data-title>Song recordings and annotation files of 3 canaries used to evaluate training of TweetyNet models for birdsong segmentation and annotation</data-title><source>Dryad Digital Repository</source><pub-id pub-id-type="doi">10.5061/dryad.xgxd254f4</pub-id></element-citation></p><p><element-citation id="dataset2" publication-type="data" specific-use="isSupplementedBy"><person-group person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name><name><surname>Cohen</surname><given-names>Y</given-names></name></person-group><year iso-8601-date="2022">2022</year><data-title>Model checkpoints, logs, and source data files</data-title><source>Dryad Digital Repository</source><pub-id pub-id-type="doi">10.5061/dryad.gtht76hk4</pub-id></element-citation></p><p><element-citation id="dataset3" publication-type="data" specific-use="isSupplementedBy"><person-group person-group-type="author"><name><surname>Nicholson</surname><given-names>DA</given-names></name></person-group><year iso-8601-date="2022">2022</year><data-title>TweetyNet</data-title><source>Zenodo</source><pub-id pub-id-type="doi">10.5281/zenodo.5823556</pub-id></element-citation></p><p>The following previously published datasets were used:</p><p><element-citation id="dataset4" publication-type="data" specific-use="references"><person-group 
person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name><name><surname>Queen</surname><given-names>JE</given-names></name><name><surname>Sober</surname><given-names>JS</given-names></name></person-group><year iso-8601-date="2017">2017</year><data-title>Bengalese Finch song repository</data-title><source>figshare</source><pub-id pub-id-type="doi">10.6084/m9.figshare.4805749.v6</pub-id></element-citation></p><p><element-citation id="dataset5" publication-type="data" specific-use="references"><person-group person-group-type="author"><name><surname>Koumura</surname><given-names>T</given-names></name></person-group><year iso-8601-date="2016">2016</year><data-title>BirdsongRecognition</data-title><source>figshare</source><pub-id pub-id-type="doi">10.6084/m9.figshare.3470165.v1</pub-id></element-citation></p></sec><ack id="ack"><title>Acknowledgements</title><p>This study was supported by NIH grants R01NS104925, R24NS098536, and R01NS118424 (TJG). We thank J Markowitz and TM Otchy for sharing song datasets, and Nvidia Corporation for a technology grant (YC and Samuel J Sober lab). 
We also thank the Sober lab for providing compute resources and feedback on early versions of this work.</p></ack><ref-list><title>References</title><ref id="bib1"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abdel-Hamid</surname><given-names>O</given-names></name><name><surname>Mohamed</surname><given-names>A</given-names></name><name><surname>Jiang</surname><given-names>H</given-names></name><name><surname>Deng</surname><given-names>L</given-names></name><name><surname>Penn</surname><given-names>G</given-names></name><name><surname>Yu</surname><given-names>D</given-names></name></person-group><year iso-8601-date="2014">2014</year><article-title>Convolutional Neural Networks for Speech Recognition</article-title><source>IEEE/ACM Transactions on Audio, Speech, and Language Processing</source><volume>22</volume><fpage>1533</fpage><lpage>1545</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2014.2339736</pub-id></element-citation></ref><ref id="bib2"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alliende</surname><given-names>J</given-names></name><name><surname>Lehongre</surname><given-names>K</given-names></name><name><surname>Del Negro</surname><given-names>C</given-names></name></person-group><year iso-8601-date="2013">2013</year><article-title>A species-specific view of song representation in a sensorimotor nucleus</article-title><source>Journal of Physiology, Paris</source><volume>107</volume><fpage>193</fpage><lpage>202</lpage><pub-id pub-id-type="doi">10.1016/j.jphysparis.2012.08.004</pub-id><pub-id pub-id-type="pmid">22960663</pub-id></element-citation></ref><ref id="bib3"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Alonso</surname><given-names>LM</given-names></name><name><surname>Alliende</surname><given-names>JA</given-names></name><name><surname>Goller</surname><given-names>F</given-names></name><name><surname>Mindlin</surname><given-names>GB</given-names></name></person-group><year iso-8601-date="2009">2009</year><article-title>Low-dimensional dynamical model for the diversity of pressure patterns used in canary song</article-title><source>Physical Review. E, Statistical, Nonlinear, and Soft Matter Physics</source><volume>79</volume><elocation-id>041929</elocation-id><pub-id pub-id-type="doi">10.1103/PhysRevE.79.041929</pub-id><pub-id pub-id-type="pmid">19518278</pub-id></element-citation></ref><ref id="bib4"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alvarez-Buylla</surname><given-names>A</given-names></name><name><surname>Kirn</surname><given-names>JR</given-names></name><name><surname>Nottebohm</surname><given-names>F</given-names></name></person-group><year iso-8601-date="1990">1990</year><article-title>Birth of projection neurons in adult avian brain may be related to perceptual or motor learning</article-title><source>Science</source><volume>249</volume><fpage>1444</fpage><lpage>1446</lpage><pub-id pub-id-type="doi">10.1126/science.1698312</pub-id><pub-id pub-id-type="pmid">1698312</pub-id></element-citation></ref><ref id="bib5"><element-citation publication-type="confproc"><person-group 
person-group-type="author"><name><surname>Amodei</surname><given-names>D</given-names></name><name><surname>Ananthanarayanan</surname><given-names>S</given-names></name><name><surname>Anubhai</surname><given-names>R</given-names></name><name><surname>Bai</surname><given-names>J</given-names></name><name><surname>Battenberg</surname><given-names>E</given-names></name><name><surname>Case</surname><given-names>C</given-names></name><name><surname>Casper</surname><given-names>J</given-names></name><name><surname>Catanzaro</surname><given-names>B</given-names></name><name><surname>Cheng</surname><given-names>Q</given-names></name><name><surname>Chen</surname><given-names>G</given-names></name></person-group><year iso-8601-date="2016">2016</year><conf-name>Deep speech 2: End-to-end speech recognition in english and mandarin</conf-name><article-title>In International Conference on Machine Learning PMLR</article-title><fpage>173</fpage><lpage>182</lpage></element-citation></ref><ref id="bib6"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Anderson</surname><given-names>SE</given-names></name><name><surname>Dave</surname><given-names>AS</given-names></name><name><surname>Margoliash</surname><given-names>D</given-names></name></person-group><year iso-8601-date="1996">1996</year><article-title>Template-based automatic recognition of birdsong syllables from continuous recordings</article-title><source>The Journal of the Acoustical Society of America</source><volume>100</volume><fpage>1209</fpage><lpage>1219</lpage><pub-id pub-id-type="doi">10.1121/1.415968</pub-id><pub-id pub-id-type="pmid">8759970</pub-id></element-citation></ref><ref id="bib7"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Appeltants</surname><given-names>D</given-names></name><name><surname>Gentner</surname><given-names>TQ</given-names></name><name><surname>Hulse</surname><given-names>SH</given-names></name><name><surname>Balthazart</surname><given-names>J</given-names></name><name><surname>Ball</surname><given-names>GF</given-names></name></person-group><year iso-8601-date="2005">2005</year><article-title>The effect of auditory distractors on song discrimination in male canaries (Serinus canaria)</article-title><source>Behavioural Processes</source><volume>69</volume><fpage>331</fpage><lpage>341</lpage><pub-id pub-id-type="doi">10.1016/j.beproc.2005.01.010</pub-id><pub-id pub-id-type="pmid">15896531</pub-id></element-citation></ref><ref id="bib8"><element-citation publication-type="software"><person-group person-group-type="author"><collab>Audacity Team</collab></person-group><year iso-8601-date="2019">2019</year><data-title>Audacity</data-title><publisher-name>Audacity</publisher-name><ext-link ext-link-type="uri" xlink:href="https://www.audacityteam.org/2019/?p=t">https://www.audacityteam.org/2019/?p=t</ext-link></element-citation></ref><ref id="bib9"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Berwick</surname><given-names>RC</given-names></name><name><surname>Okanoya</surname><given-names>K</given-names></name><name><surname>Beckers</surname><given-names>GJL</given-names></name><name><surname>Bolhuis</surname><given-names>JJ</given-names></name></person-group><year iso-8601-date="2011">2011</year><article-title>Songs to syntax: the linguistics of birdsong</article-title><source>Trends in Cognitive Sciences</source><volume>15</volume><fpage>113</fpage><lpage>121</lpage><pub-id pub-id-type="doi">10.1016/j.tics.2011.01.002</pub-id><pub-id pub-id-type="pmid">21296608</pub-id></element-citation></ref><ref id="bib10"><element-citation publication-type="confproc"><person-group 
person-group-type="author"><name><surname>Böck</surname><given-names>S</given-names></name><name><surname>Schedl</surname><given-names>M</given-names></name></person-group><year iso-8601-date="2012">2012</year><conf-name>Polyphonic Piano Note Transcription with Recurrent Neural Networks</conf-name><article-title>IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</article-title><fpage>121</fpage><lpage>124</lpage></element-citation></ref><ref id="bib11"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Boersma</surname><given-names>P</given-names></name><name><surname>Weenink</surname><given-names>D</given-names></name></person-group><year iso-8601-date="2021">2021</year><data-title>Doing Phonetics by Computer</data-title><version designator="5.3.51">5.3.51</version><source>Praat</source><ext-link ext-link-type="uri" xlink:href="https://www.fon.hum.uva.nl/praat/">https://www.fon.hum.uva.nl/praat/</ext-link></element-citation></ref><ref id="bib12"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Brainard</surname><given-names>MS</given-names></name><name><surname>Doupe</surname><given-names>AJ</given-names></name></person-group><year iso-8601-date="2002">2002</year><article-title>What songbirds teach us about learning</article-title><source>Nature</source><volume>417</volume><fpage>351</fpage><lpage>358</lpage><pub-id pub-id-type="doi">10.1038/417351a</pub-id><pub-id pub-id-type="pmid">12015616</pub-id></element-citation></ref><ref id="bib13"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Burkett</surname><given-names>ZD</given-names></name><name><surname>Day</surname><given-names>NF</given-names></name><name><surname>Peñagarikano</surname><given-names>O</given-names></name><name><surname>Geschwind</surname><given-names>DH</given-names></name><name><surname>White</surname><given-names>SA</given-names></name></person-group><year iso-8601-date="2015">2015</year><article-title>VoICE: A semi-automated pipeline for standardizing vocal analysis across models</article-title><source>Scientific Reports</source><volume>5</volume><elocation-id>10237</elocation-id><pub-id pub-id-type="doi">10.1038/srep10237</pub-id><pub-id pub-id-type="pmid">26018425</pub-id></element-citation></ref><ref id="bib14"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Caswell</surname><given-names>TA</given-names></name><name><surname>Droettboom</surname><given-names>M</given-names></name><name><surname>Lee</surname><given-names>A</given-names></name><name><surname>de Andrade</surname><given-names>ES</given-names></name></person-group><year iso-8601-date="2020">2020</year><data-title>Matplotlib/Matplotlib: REL</data-title><version designator="V3.3.2">V3.3.2</version><source>Zenodo</source><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo/matplotlib.org/">https://doi.org/10.5281/zenodo/matplotlib.org/</ext-link></element-citation></ref><ref id="bib15"><element-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>T</given-names></name><name><surname>Guestrin</surname><given-names>C</given-names></name></person-group><year iso-8601-date="2016">2016</year><conf-name>XGBoost: A scalable tree boosting system</conf-name><article-title>In Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, KDD ’16</article-title><fpage>785</fpage><lpage>794</lpage><pub-id 
pub-id-type="doi">10.1145/2939672.2939785</pub-id></element-citation></ref><ref id="bib16"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Coffey</surname><given-names>KR</given-names></name><name><surname>Marx</surname><given-names>RG</given-names></name><name><surname>Neumaier</surname><given-names>JF</given-names></name></person-group><year iso-8601-date="2019">2019</year><article-title>DeepSqueak: a deep learning-based system for detection and analysis of ultrasonic vocalizations</article-title><source>Neuropsychopharmacology : Official Publication of the American College of Neuropsychopharmacology</source><volume>44</volume><fpage>859</fpage><lpage>868</lpage><pub-id pub-id-type="doi">10.1038/s41386-018-0303-6</pub-id><pub-id pub-id-type="pmid">30610191</pub-id></element-citation></ref><ref id="bib17"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cohen</surname><given-names>Y</given-names></name><name><surname>Shen</surname><given-names>J</given-names></name><name><surname>Semu</surname><given-names>D</given-names></name><name><surname>Leman</surname><given-names>DP</given-names></name><name><surname>Liberti</surname><given-names>WA</given-names></name><name><surname>Perkins</surname><given-names>LN</given-names></name><name><surname>Liberti</surname><given-names>DC</given-names></name><name><surname>Kotton</surname><given-names>DN</given-names></name><name><surname>Gardner</surname><given-names>TJ</given-names></name></person-group><year iso-8601-date="2020">2020</year><article-title>Hidden neural states underlie canary song syntax</article-title><source>Nature</source><volume>582</volume><fpage>539</fpage><lpage>544</lpage><pub-id pub-id-type="doi">10.1038/s41586-020-2397-3</pub-id><pub-id pub-id-type="pmid">32555461</pub-id></element-citation></ref><ref id="bib18"><element-citation publication-type="software"><person-group 
person-group-type="author"><name><surname>Cohen</surname><given-names>Y</given-names></name></person-group><year iso-8601-date="2022">2022</year><data-title>BirdSongBout.</data-title><source>GitHub.</source><ext-link ext-link-type="uri" xlink:href="https://github.com/yardencsGitHub/BirdSongBout/tree/master/helpers/GUI">https://github.com/yardencsGitHub/BirdSongBout/tree/master/helpers/GUI</ext-link></element-citation></ref><ref id="bib19"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>da Costa-Luis</surname><given-names>CO</given-names></name></person-group><year iso-8601-date="2019">2019</year><article-title>tqdm: A Fast, Extensible Progress Meter for Python and CLI</article-title><source>Journal of Open Source Software</source><volume>4</volume><elocation-id>1277</elocation-id><pub-id pub-id-type="doi">10.21105/joss.01277</pub-id></element-citation></ref><ref id="bib20"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Daou</surname><given-names>A</given-names></name><name><surname>Johnson</surname><given-names>F</given-names></name><name><surname>Wu</surname><given-names>W</given-names></name><name><surname>Bertram</surname><given-names>R</given-names></name></person-group><year iso-8601-date="2012">2012</year><article-title>A computational tool for automated large-scale analysis and measurement of bird-song syntax</article-title><source>Journal of Neuroscience Methods</source><volume>210</volume><fpage>147</fpage><lpage>160</lpage><pub-id pub-id-type="doi">10.1016/j.jneumeth.2012.07.020</pub-id><pub-id pub-id-type="pmid">22890237</pub-id></element-citation></ref><ref id="bib21"><element-citation publication-type="software"><person-group person-group-type="author"><collab>Dask Development Team</collab></person-group><year iso-8601-date="2016">2016</year><data-title>Dask: Library for Dynamic Task Scheduling</data-title><version 
designator="0.1">0.1</version><source>Dask</source><ext-link ext-link-type="uri" xlink:href="https://docs.dask.org">https://docs.dask.org</ext-link></element-citation></ref><ref id="bib22"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Farabet</surname><given-names>C</given-names></name><name><surname>Couprie</surname><given-names>C</given-names></name><name><surname>Najman</surname><given-names>L</given-names></name><name><surname>Lecun</surname><given-names>Y</given-names></name></person-group><year iso-8601-date="2013">2013</year><article-title>Learning hierarchical features for scene labeling</article-title><source>IEEE Transactions on Pattern Analysis and Machine Intelligence</source><volume>35</volume><fpage>1915</fpage><lpage>1929</lpage><pub-id pub-id-type="doi">10.1109/TPAMI.2012.231</pub-id><pub-id pub-id-type="pmid">23787344</pub-id></element-citation></ref><ref id="bib23"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fee</surname><given-names>MS</given-names></name><name><surname>Scharff</surname><given-names>C</given-names></name></person-group><year iso-8601-date="2010">2010</year><article-title>The songbird as a model for the generation and learning of complex sequential behaviors</article-title><source>ILAR Journal</source><volume>51</volume><fpage>362</fpage><lpage>377</lpage><pub-id pub-id-type="doi">10.1093/ilar.51.4.362</pub-id><pub-id pub-id-type="pmid">21131713</pub-id></element-citation></ref><ref id="bib24"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Fonseca</surname><given-names>AH</given-names></name><name><surname>Santana</surname><given-names>GM</given-names></name><name><surname>Ortiz</surname><given-names>GMB</given-names></name><name><surname>Bampi</surname><given-names>S</given-names></name><name><surname>Dietrich</surname><given-names>MO</given-names></name></person-group><year iso-8601-date="2021">2021</year><article-title>Analysis of ultrasonic vocalizations from mice using computer vision and machine learning</article-title><source>eLife</source><volume>10</volume><elocation-id>e59161</elocation-id><pub-id pub-id-type="doi">10.7554/eLife.59161</pub-id></element-citation></ref><ref id="bib25"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gardner</surname><given-names>TJ</given-names></name><name><surname>Naef</surname><given-names>F</given-names></name><name><surname>Nottebohm</surname><given-names>F</given-names></name></person-group><year iso-8601-date="2005">2005</year><article-title>Freedom and rules: the acquisition and reprogramming of a bird’s learned song</article-title><source>Science</source><volume>308</volume><fpage>1046</fpage><lpage>1049</lpage><pub-id pub-id-type="doi">10.1126/science.1108214</pub-id><pub-id pub-id-type="pmid">15890887</pub-id></element-citation></ref><ref id="bib26"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goffinet</surname><given-names>J</given-names></name><name><surname>Brudner</surname><given-names>S</given-names></name><name><surname>Mooney</surname><given-names>R</given-names></name><name><surname>Pearson</surname><given-names>J</given-names></name></person-group><year iso-8601-date="2021">2021</year><article-title>Low-dimensional learned feature spaces quantify individual and group differences in vocal 
repertoires</article-title><source>eLife</source><volume>10</volume><elocation-id>e67855</elocation-id><pub-id pub-id-type="doi">10.7554/eLife.67855</pub-id><pub-id pub-id-type="pmid">33988503</pub-id></element-citation></ref><ref id="bib27"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Goldman</surname><given-names>SA</given-names></name><name><surname>Nottebohm</surname><given-names>F</given-names></name></person-group><year iso-8601-date="1983">1983</year><article-title>Neuronal production, migration, and differentiation in a vocal control nucleus of the adult female canary brain</article-title><source>PNAS</source><volume>80</volume><fpage>2390</fpage><lpage>2394</lpage><pub-id pub-id-type="doi">10.1073/pnas.80.8.2390</pub-id><pub-id pub-id-type="pmid">6572982</pub-id></element-citation></ref><ref id="bib28"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Goodfellow</surname><given-names>I</given-names></name><name><surname>Bengio</surname><given-names>Y</given-names></name><name><surname>Courville</surname><given-names>A</given-names></name></person-group><year iso-8601-date="2016">2016</year><source>Deep Learning</source><publisher-name>MIT press</publisher-name></element-citation></ref><ref id="bib29"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Graves</surname><given-names>A</given-names></name><name><surname>Schmidhuber</surname><given-names>J</given-names></name></person-group><year iso-8601-date="2005">2005</year><article-title>Framewise phoneme classification with bidirectional LSTM and other neural network architectures</article-title><source>Neural Networks</source><volume>18</volume><fpage>602</fpage><lpage>610</lpage><pub-id pub-id-type="doi">10.1016/j.neunet.2005.06.042</pub-id><pub-id pub-id-type="pmid">16112549</pub-id></element-citation></ref><ref id="bib30"><element-citation 
publication-type="confproc"><person-group person-group-type="author"><name><surname>Graves</surname><given-names>A</given-names></name><name><surname>Fernández</surname><given-names>S</given-names></name><name><surname>Gomez</surname><given-names>F</given-names></name><name><surname>Schmidhuber</surname><given-names>J</given-names></name></person-group><year iso-8601-date="2006">2006</year><article-title>Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks</article-title><conf-name>Proceedings of the 23rd International Conference on Machine Learning</conf-name><fpage>369</fpage><lpage>376</lpage><pub-id pub-id-type="doi">10.1145/1143844.1143891</pub-id></element-citation></ref><ref id="bib31"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Graves</surname><given-names>A</given-names></name></person-group><year iso-8601-date="2012">2012</year><chapter-title>Supervised sequence labelling</chapter-title><person-group person-group-type="editor"><name><surname>Graves</surname><given-names>A</given-names></name></person-group><source>In Supervised Sequence Labelling with Recurrent Neural Networks</source><publisher-name>Springer</publisher-name><fpage>5</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1007/978-3-642-24797-2</pub-id></element-citation></ref><ref id="bib32"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Grisel</surname><given-names>O</given-names></name><name><surname>Mueller</surname><given-names>O</given-names></name><name><surname>Gramfort</surname><given-names>A</given-names></name><name><surname>Louppe</surname><given-names>G</given-names></name></person-group><year iso-8601-date="2020">2020</year><data-title>Scikit-Learn/Scikit-Learn: Scikit-Learn</data-title><version designator="0.24.0">0.24.0</version><source>Scikit-Learn</source><ext-link ext-link-type="uri" 
xlink:href="https://scikit-learn.org/stable/">https://scikit-learn.org/stable/</ext-link></element-citation></ref><ref id="bib33"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Harris</surname><given-names>CR</given-names></name><name><surname>Millman</surname><given-names>KJ</given-names></name><name><surname>van der Walt</surname><given-names>SJ</given-names></name><name><surname>Gommers</surname><given-names>R</given-names></name><name><surname>Virtanen</surname><given-names>P</given-names></name><name><surname>Cournapeau</surname><given-names>D</given-names></name><name><surname>Wieser</surname><given-names>E</given-names></name><name><surname>Taylor</surname><given-names>J</given-names></name><name><surname>Berg</surname><given-names>S</given-names></name><name><surname>Smith</surname><given-names>NJ</given-names></name><name><surname>Kern</surname><given-names>R</given-names></name><name><surname>Picus</surname><given-names>M</given-names></name><name><surname>Hoyer</surname><given-names>S</given-names></name><name><surname>van Kerkwijk</surname><given-names>MH</given-names></name><name><surname>Brett</surname><given-names>M</given-names></name><name><surname>Haldane</surname><given-names>A</given-names></name><name><surname>Del Río</surname><given-names>JF</given-names></name><name><surname>Wiebe</surname><given-names>M</given-names></name><name><surname>Peterson</surname><given-names>P</given-names></name><name><surname>Gérard-Marchant</surname><given-names>P</given-names></name><name><surname>Sheppard</surname><given-names>K</given-names></name><name><surname>Reddy</surname><given-names>T</given-names></name><name><surname>Weckesser</surname><given-names>W</given-names></name><name><surname>Abbasi</surname><given-names>H</given-names></name><name><surname>Gohlke</surname><given-names>C</given-names></name><name><surname>Oliphant</surname><given-names>TE</given-names></name></person-group><year 
iso-8601-date="2020">2020</year><article-title>Array programming with NumPy</article-title><source>Nature</source><volume>585</volume><fpage>357</fpage><lpage>362</lpage><pub-id pub-id-type="doi">10.1038/s41586-020-2649-2</pub-id><pub-id pub-id-type="pmid">32939066</pub-id></element-citation></ref><ref id="bib34"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hedley</surname><given-names>RW</given-names></name></person-group><year iso-8601-date="2016">2016</year><article-title>Complexity, Predictability and Time Homogeneity of Syntax in the Songs of Cassin’s Vireo (Vireo cassinii)</article-title><source>PLOS ONE</source><volume>11</volume><elocation-id>e0150822</elocation-id><pub-id pub-id-type="doi">10.1371/journal.pone.0150822</pub-id><pub-id pub-id-type="pmid">27050537</pub-id></element-citation></ref><ref id="bib35"><element-citation publication-type="preprint"><person-group person-group-type="author"><name><surname>Heusel</surname><given-names>M</given-names></name><name><surname>Ramsauer</surname><given-names>H</given-names></name><name><surname>Unterthiner</surname><given-names>T</given-names></name><name><surname>Nessler</surname><given-names>B</given-names></name><name><surname>Hochreiter</surname><given-names>S</given-names></name></person-group><year iso-8601-date="2017">2017</year><article-title>GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium</article-title><source>arXiv</source><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1706.08500">https://arxiv.org/abs/1706.08500</ext-link></element-citation></ref><ref id="bib36"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hunter</surname><given-names>JD</given-names></name></person-group><year iso-8601-date="2007">2007</year><article-title>Matplotlib: A 2D Graphics Environment</article-title><source>Computing in Science &amp; 
Engineering</source><volume>9</volume><fpage>90</fpage><lpage>95</lpage><pub-id pub-id-type="doi">10.1109/MCSE.2007.55</pub-id></element-citation></ref><ref id="bib37"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>James</surname><given-names>G</given-names></name><name><surname>Witten</surname><given-names>D</given-names></name><name><surname>Hastie</surname><given-names>T</given-names></name><name><surname>Tibshirani</surname><given-names>R</given-names></name></person-group><year iso-8601-date="2013">2013</year><source>An Introduction to Statistical Learning</source><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-1-4614-7138-7</pub-id></element-citation></ref><ref id="bib38"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jin</surname><given-names>DZ</given-names></name><name><surname>Kozhevnikov</surname><given-names>AA</given-names></name><name><surname>Friston</surname><given-names>KJ</given-names></name></person-group><year iso-8601-date="2011">2011</year><article-title>A Compact Statistical Model of the Song Syntax in Bengalese Finch</article-title><source>PLOS Computational Biology</source><volume>7</volume><elocation-id>e1001108</elocation-id><pub-id pub-id-type="doi">10.1371/journal.pcbi.1001108</pub-id><pub-id pub-id-type="pmid">21445230</pub-id></element-citation></ref><ref id="bib39"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kakishita</surname><given-names>Y</given-names></name><name><surname>Sasahara</surname><given-names>K</given-names></name><name><surname>Nishino</surname><given-names>T</given-names></name><name><surname>Takahasi</surname><given-names>M</given-names></name><name><surname>Okanoya</surname><given-names>K</given-names></name></person-group><year iso-8601-date="2008">2008</year><article-title>Ethological data mining: an automata-based approach 
to extract behavioral units and rules</article-title><source>Data Mining and Knowledge Discovery</source><volume>18</volume><fpage>446</fpage><lpage>471</lpage><pub-id pub-id-type="doi">10.1007/s10618-008-0122-1</pub-id></element-citation></ref><ref id="bib40"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kershenbaum</surname><given-names>A</given-names></name><name><surname>Blumstein</surname><given-names>DT</given-names></name><name><surname>Roch</surname><given-names>MA</given-names></name><name><surname>Akçay</surname><given-names>Ç</given-names></name><name><surname>Backus</surname><given-names>G</given-names></name><name><surname>Bee</surname><given-names>MA</given-names></name><name><surname>Bohn</surname><given-names>K</given-names></name><name><surname>Cao</surname><given-names>Y</given-names></name><name><surname>Carter</surname><given-names>G</given-names></name><name><surname>Cäsar</surname><given-names>C</given-names></name><name><surname>Coen</surname><given-names>M</given-names></name><name><surname>DeRuiter</surname><given-names>SL</given-names></name><name><surname>Doyle</surname><given-names>L</given-names></name><name><surname>Edelman</surname><given-names>S</given-names></name><name><surname>Ferrer-i-Cancho</surname><given-names>R</given-names></name><name><surname>Freeberg</surname><given-names>TM</given-names></name><name><surname>Garland</surname><given-names>EC</given-names></name><name><surname>Gustison</surname><given-names>M</given-names></name><name><surname>Harley</surname><given-names>HE</given-names></name><name><surname>Huetz</surname><given-names>C</given-names></name><name><surname>Hughes</surname><given-names>M</given-names></name><name><surname>Hyland 
Bruno</surname><given-names>J</given-names></name><name><surname>Ilany</surname><given-names>A</given-names></name><name><surname>Jin</surname><given-names>DZ</given-names></name><name><surname>Johnson</surname><given-names>M</given-names></name><name><surname>Ju</surname><given-names>C</given-names></name><name><surname>Karnowski</surname><given-names>J</given-names></name><name><surname>Lohr</surname><given-names>B</given-names></name><name><surname>Manser</surname><given-names>MB</given-names></name><name><surname>McCowan</surname><given-names>B</given-names></name><name><surname>Mercado</surname><given-names>E</given-names><suffix>III</suffix></name><name><surname>Narins</surname><given-names>PM</given-names></name><name><surname>Piel</surname><given-names>A</given-names></name><name><surname>Rice</surname><given-names>M</given-names></name><name><surname>Salmi</surname><given-names>R</given-names></name><name><surname>Sasahara</surname><given-names>K</given-names></name><name><surname>Sayigh</surname><given-names>L</given-names></name><name><surname>Shiu</surname><given-names>Y</given-names></name><name><surname>Taylor</surname><given-names>C</given-names></name><name><surname>Vallejo</surname><given-names>EE</given-names></name><name><surname>Waller</surname><given-names>S</given-names></name><name><surname>Zamora-Gutierrez</surname><given-names>V</given-names></name></person-group><year iso-8601-date="2016">2016</year><article-title>Acoustic sequences in non-human animals: a tutorial review and prospectus</article-title><source>Biological Reviews</source><volume>91</volume><fpage>13</fpage><lpage>52</lpage><pub-id pub-id-type="doi">10.1111/brv.12160</pub-id><pub-id pub-id-type="pmid">25428267</pub-id></element-citation></ref><ref id="bib41"><element-citation publication-type="preprint"><person-group 
person-group-type="author"><name><surname>Kingma</surname><given-names>DP</given-names></name><name><surname>Ba</surname><given-names>J</given-names></name></person-group><year iso-8601-date="2014">2014</year><article-title>Adam: A Method for Stochastic Optimization</article-title><source>arXiv</source><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1412.6980">https://arxiv.org/abs/1412.6980</ext-link></element-citation></ref><ref id="bib42"><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kluyver</surname><given-names>T</given-names></name><name><surname>Ragan-Kelley</surname><given-names>B</given-names></name><name><surname>Pérez</surname><given-names>F</given-names></name><name><surname>Granger</surname><given-names>BE</given-names></name><name><surname>Bussonnier</surname><given-names>M</given-names></name><name><surname>Frederic</surname><given-names>J</given-names></name><name><surname>Kelley</surname><given-names>K</given-names></name><name><surname>Hamrick</surname><given-names>JB</given-names></name><name><surname>Grout</surname><given-names>J</given-names></name><name><surname>Corlay</surname><given-names>S</given-names></name></person-group><year iso-8601-date="2016">2016</year><source>Jupyter Notebooks-a Publishing Format for Reproducible Computational Workflows</source><publisher-name>ELPUB.</publisher-name><pub-id pub-id-type="doi">10.3233/978-1-61499-649-1-87</pub-id></element-citation></ref><ref id="bib43"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kogan</surname><given-names>JA</given-names></name><name><surname>Margoliash</surname><given-names>D</given-names></name></person-group><year iso-8601-date="1998">1998</year><article-title>Automated recognition of bird song elements from continuous recordings using dynamic time warping and hidden Markov models: A comparative study</article-title><source>The Journal of the Acoustical 
Society of America</source><volume>103</volume><fpage>2185</fpage><lpage>2196</lpage><pub-id pub-id-type="doi">10.1121/1.421364</pub-id><pub-id pub-id-type="pmid">9566338</pub-id></element-citation></ref><ref id="bib44"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Koumura</surname><given-names>T</given-names></name></person-group><year iso-8601-date="2016">2016</year><article-title>BirdsongRecognition</article-title><source>Figshare</source><volume>1</volume><elocation-id>3470165</elocation-id><pub-id pub-id-type="doi">10.6084/m9.figshare.3470165.v1</pub-id></element-citation></ref><ref id="bib45"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Koumura</surname><given-names>T</given-names></name><name><surname>Okanoya</surname><given-names>K</given-names></name></person-group><year iso-8601-date="2016">2016</year><article-title>Automatic Recognition of Element Classes and Boundaries in the Birdsong with Variable Sequences</article-title><source>PLOS ONE</source><volume>11</volume><elocation-id>e0159188</elocation-id><pub-id pub-id-type="doi">10.1371/journal.pone.0159188</pub-id><pub-id pub-id-type="pmid">27442240</pub-id></element-citation></ref><ref id="bib46"><element-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Krizhevsky</surname><given-names>A</given-names></name><name><surname>Sutskever</surname><given-names>I</given-names></name><name><surname>Hinton</surname><given-names>GE</given-names></name></person-group><year iso-8601-date="2012">2012</year><article-title>ImageNet classification with deep convolutional neural networks</article-title><conf-name>Advances in Neural Information Processing Systems</conf-name><publisher-name>Curran Associates</publisher-name><fpage>1097</fpage><lpage>1105</lpage></element-citation></ref><ref id="bib47"><element-citation publication-type="confproc"><person-group 
person-group-type="author"><name><surname>Lea</surname><given-names>C</given-names></name><name><surname>Flynn</surname><given-names>MD</given-names></name><name><surname>Vidal</surname><given-names>R</given-names></name><name><surname>Reiter</surname><given-names>A</given-names></name><name><surname>Hager</surname><given-names>GD</given-names></name></person-group><year iso-8601-date="2017">2017</year><article-title>Temporal Convolutional Networks for Action Segmentation and Detection</article-title><conf-name>2017 IEEE Conference on Computer Vision and Pattern Recognition</conf-name><pub-id pub-id-type="doi">10.1109/CVPR.2017.113</pub-id></element-citation></ref><ref id="bib48"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Leonardo</surname><given-names>A</given-names></name><name><surname>Fee</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2005">2005</year><article-title>Ensemble coding of vocal control in birdsong</article-title><source>The Journal of Neuroscience</source><volume>25</volume><fpage>652</fpage><lpage>661</lpage><pub-id pub-id-type="doi">10.1523/JNEUROSCI.3036-04.2005</pub-id><pub-id pub-id-type="pmid">15659602</pub-id></element-citation></ref><ref id="bib49"><element-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Marcel</surname><given-names>S</given-names></name><name><surname>Rodriguez</surname><given-names>Y</given-names></name></person-group><year iso-8601-date="2010">2010</year><article-title>Torchvision the machine-vision package of torch</article-title><conf-name>Proceedings of the 18th ACM International Conference on Multimedia</conf-name><publisher-name>Association for Computing Machinery</publisher-name><pub-id pub-id-type="doi">10.1145/1873951.1874254</pub-id></element-citation></ref><ref id="bib50"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Markowitz</surname><given-names>JE</given-names></name><name><surname>Ivie</surname><given-names>E</given-names></name><name><surname>Kligler</surname><given-names>L</given-names></name><name><surname>Gardner</surname><given-names>TJ</given-names></name></person-group><year iso-8601-date="2013">2013</year><article-title>Long-range order in canary song</article-title><source>PLOS Computational Biology</source><volume>9</volume><elocation-id>e1003052</elocation-id><pub-id pub-id-type="doi">10.1371/journal.pcbi.1003052</pub-id><pub-id pub-id-type="pmid">23658509</pub-id></element-citation></ref><ref id="bib51"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Markowitz</surname><given-names>J</given-names></name></person-group><year iso-8601-date="2022">2022a</year><data-title>pst.</data-title><source>GitHub.</source><ext-link ext-link-type="uri" xlink:href="https://github.com/jmarkow/pst">https://github.com/jmarkow/pst</ext-link></element-citation></ref><ref id="bib52"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Markowitz</surname><given-names>J</given-names></name></person-group><year iso-8601-date="2022">2022b</year><data-title>zftftb.</data-title><source>GitHub.</source><ext-link ext-link-type="uri" xlink:href="https://github.com/jmarkow/zftftb/blob/master/zftftb_pretty_sonogram.m">https://github.com/jmarkow/zftftb/blob/master/zftftb_pretty_sonogram.m</ext-link></element-citation></ref><ref id="bib53"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mets</surname><given-names>DG</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2018">2018a</year><article-title>An automated approach to the quantitation of vocalizations and vocal learning in the songbird</article-title><source>PLOS 
Computational Biology</source><volume>14</volume><elocation-id>e1006437</elocation-id><pub-id pub-id-type="doi">10.1371/journal.pcbi.1006437</pub-id><pub-id pub-id-type="pmid">30169523</pub-id></element-citation></ref><ref id="bib54"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mets</surname><given-names>DG</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2018">2018b</year><article-title>Genetic variation interacts with experience to determine interindividual differences in learned song</article-title><source>PNAS</source><volume>115</volume><fpage>421</fpage><lpage>426</lpage><pub-id pub-id-type="doi">10.1073/pnas.1713031115</pub-id></element-citation></ref><ref id="bib55"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mets</surname><given-names>DG</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2019">2019</year><article-title>Learning is enhanced by tailoring instruction to individual genetic differences</article-title><source>eLife</source><volume>8</volume><elocation-id>e47216</elocation-id><pub-id pub-id-type="doi">10.7554/eLife.47216</pub-id></element-citation></ref><ref id="bib56"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mooney</surname><given-names>R</given-names></name></person-group><year iso-8601-date="2009">2009</year><article-title>Neurobiology of song learning</article-title><source>Current Opinion in Neurobiology</source><volume>19</volume><fpage>654</fpage><lpage>660</lpage><pub-id pub-id-type="doi">10.1016/j.conb.2009.10.004</pub-id><pub-id pub-id-type="pmid">19892546</pub-id></element-citation></ref><ref id="bib57"><element-citation publication-type="confproc"><person-group 
person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name></person-group><year iso-8601-date="2016">2016</year><article-title>Comparison of machine learning methods applied to birdsong element classification</article-title><conf-name>Python in Science Conference</conf-name><pub-id pub-id-type="doi">10.25080/Majora-629e541a-008</pub-id></element-citation></ref><ref id="bib58"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name><name><surname>Queen</surname><given-names>JE</given-names></name><name><surname>Sober</surname><given-names>SJ</given-names></name></person-group><year iso-8601-date="2017">2017</year><article-title>Bengalese Finch song repository</article-title><source>Figshare</source><volume>1</volume><elocation-id>e5</elocation-id><pub-id pub-id-type="doi">10.6084/m9.figshare.4805749.v5</pub-id></element-citation></ref><ref id="bib59"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name><name><surname>Cohen</surname><given-names>Y</given-names></name></person-group><year iso-8601-date="2021">2021</year><data-title>vak</data-title><version designator="0.4.0">0.4.0</version><source>Zenodo</source><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.5809730">https://doi.org/10.5281/zenodo.5809730</ext-link></element-citation></ref><ref id="bib60"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name></person-group><year iso-8601-date="2021">2021a</year><data-title>crowsetta</data-title><version designator="3.1.0">3.1.0</version><source>Zenodo</source><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.5792224">https://doi.org/10.5281/zenodo.5792224</ext-link></element-citation></ref><ref 
id="bib61"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name></person-group><year iso-8601-date="2021">2021b</year><data-title>Hybrid-Vocal-Classifier</data-title><version designator="2d95256">2d95256</version><source>Github</source><ext-link ext-link-type="uri" xlink:href="https://github.com/NickleDave/hybrid-vocal-classifier">https://github.com/NickleDave/hybrid-vocal-classifier</ext-link></element-citation></ref><ref id="bib62"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name></person-group><year iso-8601-date="2021">2021c</year><data-title>NickleDave/evfuncs</data-title><version designator="0.3.2">0.3.2</version><source>Zenodo</source><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.5810988">https://doi.org/10.5281/zenodo.5810988</ext-link></element-citation></ref><ref id="bib63"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name></person-group><year iso-8601-date="2021">2021d</year><data-title>NickleDave/birdsong-recognition-dataset</data-title><version designator="0.3.0">0.3.0</version><source>Zenodo</source><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.5812494">https://doi.org/10.5281/zenodo.5812494</ext-link></element-citation></ref><ref id="bib64"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Nicholson</surname><given-names>D</given-names></name></person-group><year iso-8601-date="2022">2022</year><data-title>TweetyNet.</data-title><source>GitHub.</source><ext-link ext-link-type="uri" xlink:href="https://github.com/yardencsGitHub/tweetynet">https://github.com/yardencsGitHub/tweetynet</ext-link></element-citation></ref><ref 
id="bib65"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nottebohm</surname><given-names>F</given-names></name></person-group><year iso-8601-date="1981">1981</year><article-title>A brain for all seasons: cyclical anatomical changes in song control nuclei of the canary brain</article-title><source>Science (New York, N.Y.)</source><volume>214</volume><fpage>1368</fpage><lpage>1370</lpage><pub-id pub-id-type="doi">10.1126/science.7313697</pub-id><pub-id pub-id-type="pmid">7313697</pub-id></element-citation></ref><ref id="bib66"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Otchy</surname><given-names>TM</given-names></name><name><surname>Wolff</surname><given-names>SBE</given-names></name><name><surname>Rhee</surname><given-names>JY</given-names></name><name><surname>Pehlevan</surname><given-names>C</given-names></name><name><surname>Kawai</surname><given-names>R</given-names></name><name><surname>Kempf</surname><given-names>A</given-names></name><name><surname>Gobes</surname><given-names>SMH</given-names></name><name><surname>Ölveczky</surname><given-names>BP</given-names></name></person-group><year iso-8601-date="2015">2015</year><article-title>Acute off-target effects of neural circuit manipulations</article-title><source>Nature</source><volume>528</volume><fpage>358</fpage><lpage>363</lpage><pub-id pub-id-type="doi">10.1038/nature16442</pub-id><pub-id pub-id-type="pmid">26649821</pub-id></element-citation></ref><ref id="bib67"><element-citation publication-type="software"><person-group person-group-type="author"><collab>pandas development team</collab></person-group><year iso-8601-date="2020">2020</year><data-title>pandas.</data-title><version designator="1.4.1">1.4.1</version><source>Zenodo.</source><ext-link ext-link-type="uri" 
xlink:href="https://doi.org/10.5281/zenodo.3509134">https://doi.org/10.5281/zenodo.3509134</ext-link></element-citation></ref><ref id="bib68"><element-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Parascandolo</surname><given-names>G</given-names></name><name><surname>Huttunen</surname><given-names>H</given-names></name><name><surname>Virtanen</surname><given-names>T</given-names></name></person-group><year iso-8601-date="2016">2016</year><conf-name>Recurrent neural networks for polyphonic sound event detection in real life recordings</conf-name><article-title>2016 IEEE International Conference on Acoustics, Speech and Signal Processing</article-title><pub-id pub-id-type="doi">10.1109/ICASSP.2016.7472917</pub-id></element-citation></ref><ref id="bib69"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Paszke</surname><given-names>A</given-names></name><name><surname>Gross</surname><given-names>S</given-names></name><name><surname>Chintala</surname><given-names>S</given-names></name><name><surname>Chanan</surname><given-names>G</given-names></name><name><surname>Yang</surname><given-names>E</given-names></name><name><surname>DeVito</surname><given-names>Z</given-names></name><name><surname>Lin</surname><given-names>Z</given-names></name><name><surname>Desmaison</surname><given-names>A</given-names></name><name><surname>Antiga</surname><given-names>L</given-names></name><name><surname>Lerer</surname><given-names>A</given-names></name></person-group><year iso-8601-date="2017">2017</year><data-title>Automatic differentiation in PyTorch</data-title><version designator="0.1">0.1</version><source>PyTorch</source><ext-link ext-link-type="uri" xlink:href="https://openreview.net/pdf?id=BJJsrmfCZ">https://openreview.net/pdf?id=BJJsrmfCZ</ext-link></element-citation></ref><ref id="bib70"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Pedregosa</surname><given-names>F</given-names></name><name><surname>Varoquaux</surname><given-names>G</given-names></name><name><surname>Gramfort</surname><given-names>A</given-names></name><name><surname>Michel</surname><given-names>V</given-names></name><name><surname>Thirion</surname><given-names>B</given-names></name><name><surname>Grisel</surname><given-names>O</given-names></name><name><surname>Blondel</surname><given-names>M</given-names></name><name><surname>Prettenhofer</surname><given-names>P</given-names></name><name><surname>Weiss</surname><given-names>R</given-names></name><name><surname>Dubourg</surname><given-names>V</given-names></name><name><surname>Vanderplas</surname><given-names>J</given-names></name><name><surname>Passos</surname><given-names>A</given-names></name><name><surname>Cournapeau</surname><given-names>D</given-names></name><name><surname>Brucher</surname><given-names>M</given-names></name><name><surname>Perrot</surname><given-names>M</given-names></name><name><surname>Duchesnay</surname><given-names>E</given-names></name></person-group><year iso-8601-date="2011">2011</year><article-title>Scikit-learn: Machine learning in Python</article-title><source>Journal of Machine Learning Research</source><volume>12</volume><fpage>2825</fpage><lpage>2830</lpage></element-citation></ref><ref id="bib71"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Prat</surname><given-names>Y</given-names></name><name><surname>Taub</surname><given-names>M</given-names></name><name><surname>Pratt</surname><given-names>E</given-names></name><name><surname>Yovel</surname><given-names>Y</given-names></name></person-group><year iso-8601-date="2017">2017</year><article-title>An annotated dataset of Egyptian fruit bat vocalizations across varying contexts and during vocal ontogeny</article-title><source>Scientific 
Data</source><volume>4</volume><elocation-id>170143</elocation-id><pub-id pub-id-type="doi">10.1038/sdata.2017.143</pub-id><pub-id pub-id-type="pmid">28972574</pub-id></element-citation></ref><ref id="bib72"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ron</surname><given-names>D</given-names></name><name><surname>Singer</surname><given-names>Y</given-names></name><name><surname>Tishby</surname><given-names>N</given-names></name></person-group><year iso-8601-date="1996">1996</year><article-title>The power of amnesia: Learning probabilistic automata with variable memory length</article-title><source>Machine Learning</source><volume>25</volume><fpage>117</fpage><lpage>149</lpage><pub-id pub-id-type="doi">10.1023/A:1026490906255</pub-id></element-citation></ref><ref id="bib73"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sainath</surname><given-names>TN</given-names></name><name><surname>Kingsbury</surname><given-names>B</given-names></name><name><surname>Mohamed</surname><given-names>AR</given-names></name><name><surname>Dahl</surname><given-names>GE</given-names></name><name><surname>Saon</surname><given-names>G</given-names></name><name><surname>Soltau</surname><given-names>H</given-names></name><name><surname>Beran</surname><given-names>T</given-names></name><name><surname>Aravkin</surname><given-names>AY</given-names></name><name><surname>Ramabhadran</surname><given-names>B</given-names></name></person-group><year iso-8601-date="2013">2013a</year><article-title>2013 IEEE Workshop on Automatic Speech Recognition &amp; Understanding</article-title><source>IEEEOlomouc</source><volume>1</volume><elocation-id>6707749</elocation-id><pub-id pub-id-type="doi">10.1109/ASRU.2013.6707749</pub-id></element-citation></ref><ref id="bib74"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Sainath</surname><given-names>TN</given-names></name><name><surname>Mohamed</surname><given-names>AR</given-names></name><name><surname>Kingsbury</surname><given-names>B</given-names></name><name><surname>Ramabhadran</surname><given-names>B</given-names></name></person-group><year iso-8601-date="2013">2013b</year><article-title>ICASSP 2013 - 2013 IEEE International Conference on Acoustics, Speech and Signal Processing</article-title><source>IEEEVancouver</source><volume>1</volume><elocation-id>e6639347</elocation-id><pub-id pub-id-type="doi">10.1109/ICASSP.2013.6639347</pub-id></element-citation></ref><ref id="bib75"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sainburg</surname><given-names>T</given-names></name><name><surname>Thielk</surname><given-names>M</given-names></name><name><surname>Gentner</surname><given-names>T</given-names></name></person-group><year iso-8601-date="2019">2019</year><article-title>Animal Vocalization Generative Network (AVGN): A method for visualizing, understanding, and sampling from animal communicative repertoires</article-title><source>In CogSci</source><volume>1</volume><elocation-id>3563</elocation-id></element-citation></ref><ref id="bib76"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sainburg</surname><given-names>T</given-names></name><name><surname>Thielk</surname><given-names>M</given-names></name><name><surname>Gentner</surname><given-names>TQ</given-names></name></person-group><year iso-8601-date="2020">2020</year><article-title>Finding, visualizing, and quantifying latent structure across diverse animal vocal repertoires</article-title><source>PLOS Computational Biology</source><volume>16</volume><elocation-id>e1008228</elocation-id><pub-id pub-id-type="doi">10.1371/journal.pcbi.1008228</pub-id><pub-id pub-id-type="pmid">33057332</pub-id></element-citation></ref><ref 
id="bib77"><element-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Salimans</surname><given-names>T</given-names></name><name><surname>Goodfellow</surname><given-names>I</given-names></name><name><surname>Zaremba</surname><given-names>W</given-names></name><name><surname>Cheung</surname><given-names>V</given-names></name><name><surname>Radford</surname><given-names>A</given-names></name><name><surname>Chen</surname><given-names>X</given-names></name></person-group><year iso-8601-date="2016">2016</year><conf-name>Improved Techniques for Training Gans</conf-name><article-title>Advances in neural information processing systems</article-title><fpage>2234</fpage><lpage>2242</lpage></element-citation></ref><ref id="bib78"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Scheirer</surname><given-names>WJ</given-names></name><name><surname>de Rezende Rocha</surname><given-names>A</given-names></name><name><surname>Sapkota</surname><given-names>A</given-names></name><name><surname>Boult</surname><given-names>TE</given-names></name></person-group><year iso-8601-date="2013">2013</year><article-title>Toward open set recognition</article-title><source>IEEE Transactions on Pattern Analysis and Machine Intelligence</source><volume>35</volume><fpage>1757</fpage><lpage>1772</lpage><pub-id pub-id-type="doi">10.1109/TPAMI.2012.256</pub-id><pub-id pub-id-type="pmid">23682001</pub-id></element-citation></ref><ref id="bib79"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Schlawack</surname><given-names>H</given-names></name></person-group><year iso-8601-date="2020">2020</year><data-title>Attrs</data-title><version designator="21.4.0">21.4.0</version><source>PYTHON MODULE INDEX</source><ext-link ext-link-type="uri" 
xlink:href="https://www.attrs.org/_/downloads/en/stable/pdf/">https://www.attrs.org/_/downloads/en/stable/pdf/</ext-link></element-citation></ref><ref id="bib80"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Searfoss</surname><given-names>AM</given-names></name><name><surname>Pino</surname><given-names>JC</given-names></name><name><surname>Creanza</surname><given-names>N</given-names></name><name><surname>Zamora‐Gutierrez</surname><given-names>V</given-names></name></person-group><year iso-8601-date="2020">2020</year><article-title>Chipper: Open‐source software for semi‐automated segmentation and analysis of birdsong and other natural sounds</article-title><source>Methods in Ecology and Evolution</source><volume>11</volume><fpage>524</fpage><lpage>531</lpage><pub-id pub-id-type="doi">10.1111/2041-210X.13368</pub-id></element-citation></ref><ref id="bib81"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sober</surname><given-names>SJ</given-names></name><name><surname>Wohlgemuth</surname><given-names>MJ</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2008">2008</year><article-title>Central contributions to acoustic variation in birdsong</article-title><source>The Journal of Neuroscience</source><volume>28</volume><fpage>10370</fpage><lpage>10379</lpage><pub-id pub-id-type="doi">10.1523/JNEUROSCI.2448-08.2008</pub-id><pub-id pub-id-type="pmid">18842896</pub-id></element-citation></ref><ref id="bib82"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sober</surname><given-names>SJ</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2009">2009</year><article-title>Adult birdsong is actively maintained by error correction</article-title><source>Nature 
Neuroscience</source><volume>12</volume><fpage>927</fpage><lpage>931</lpage><pub-id pub-id-type="doi">10.1038/nn.2336</pub-id><pub-id pub-id-type="pmid">19525945</pub-id></element-citation></ref><ref id="bib83"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sober</surname><given-names>SJ</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2012">2012</year><article-title>Vocal learning is constrained by the statistics of sensorimotor experience</article-title><source>PNAS</source><volume>109</volume><fpage>21099</fpage><lpage>21103</lpage><pub-id pub-id-type="doi">10.1073/pnas.1213622109</pub-id><pub-id pub-id-type="pmid">23213223</pub-id></element-citation></ref><ref id="bib84"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Suthers</surname><given-names>RA</given-names></name><name><surname>Vallet</surname><given-names>E</given-names></name><name><surname>Kreutzer</surname><given-names>M</given-names></name></person-group><year iso-8601-date="2012">2012</year><article-title>Bilateral coordination and the motor basis of female preference for sexual signals in canary song</article-title><source>The Journal of Experimental Biology</source><volume>215</volume><fpage>2950</fpage><lpage>2959</lpage><pub-id pub-id-type="doi">10.1242/jeb.071944</pub-id><pub-id pub-id-type="pmid">22875764</pub-id></element-citation></ref><ref id="bib85"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tachibana</surname><given-names>RO</given-names></name><name><surname>Oosugi</surname><given-names>N</given-names></name><name><surname>Okanoya</surname><given-names>K</given-names></name></person-group><year iso-8601-date="2014">2014</year><article-title>Semi-automatic classification of birdsong elements using a linear support vector 
machine</article-title><source>PLOS ONE</source><volume>9</volume><elocation-id>e92584</elocation-id><pub-id pub-id-type="doi">10.1371/journal.pone.0092584</pub-id><pub-id pub-id-type="pmid">24658578</pub-id></element-citation></ref><ref id="bib86"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tachibana</surname><given-names>RO</given-names></name><name><surname>Kanno</surname><given-names>K</given-names></name><name><surname>Okabe</surname><given-names>S</given-names></name><name><surname>Kobayasi</surname><given-names>KI</given-names></name><name><surname>Okanoya</surname><given-names>K</given-names></name></person-group><year iso-8601-date="2020">2020</year><article-title>USVSEG: A robust method for segmentation of ultrasonic vocalizations in rodents</article-title><source>PLOS ONE</source><volume>15</volume><elocation-id>e0228907</elocation-id><pub-id pub-id-type="doi">10.1371/journal.pone.0228907</pub-id><pub-id pub-id-type="pmid">32040540</pub-id></element-citation></ref><ref id="bib87"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Takahasi</surname><given-names>M</given-names></name><name><surname>Yamada</surname><given-names>H</given-names></name><name><surname>Okanoya</surname><given-names>K</given-names></name></person-group><year iso-8601-date="2010">2010</year><article-title>Statistical and Prosodic Cues for Song Segmentation Learning by Bengalese Finches (Lonchura striata var domestica)</article-title><source>Ethology: Formerly Zeitschrift Fur Tierpsychologie</source><volume>116</volume><fpage>481</fpage><lpage>489</lpage><pub-id pub-id-type="doi">10.1111/j.1439-0310.2010.01772.x</pub-id></element-citation></ref><ref id="bib88"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Tchernichovski</surname><given-names>O</given-names></name><name><surname>Nottebohm</surname><given-names>F</given-names></name><name><surname>Ho</surname><given-names>CE</given-names></name><name><surname>Pesaran</surname><given-names>B</given-names></name><name><surname>Mitra</surname><given-names>PP</given-names></name></person-group><year iso-8601-date="2000">2000</year><article-title>A procedure for an automated measurement of song similarity</article-title><source>Animal Behaviour</source><volume>59</volume><fpage>1167</fpage><lpage>1176</lpage><pub-id pub-id-type="doi">10.1006/anbe.1999.1416</pub-id><pub-id pub-id-type="pmid">10877896</pub-id></element-citation></ref><ref id="bib89"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tchernichovski</surname><given-names>O</given-names></name><name><surname>Mitra</surname><given-names>PP</given-names></name><name><surname>Lints</surname><given-names>T</given-names></name><name><surname>Nottebohm</surname><given-names>F</given-names></name></person-group><year iso-8601-date="2001">2001</year><article-title>Dynamics of the vocal imitation process: how a zebra finch learns its song</article-title><source>Science</source><volume>291</volume><fpage>2564</fpage><lpage>2569</lpage><pub-id pub-id-type="doi">10.1126/science.1058522</pub-id><pub-id pub-id-type="pmid">11283361</pub-id></element-citation></ref><ref id="bib90"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Thompson</surname><given-names>NS</given-names></name><name><surname>LeDoux</surname><given-names>K</given-names></name><name><surname>Moody</surname><given-names>K</given-names></name></person-group><year iso-8601-date="2012">2012</year><article-title>A system for describing bird song units</article-title><source>Bioacoustics</source><volume>5</volume><fpage>267</fpage><lpage>279</lpage><pub-id 
pub-id-type="doi">10.1080/09524622.1994.9753257</pub-id></element-citation></ref><ref id="bib91"><element-citation publication-type="software"><person-group person-group-type="author"><collab>Troyer lab</collab></person-group><year iso-8601-date="2012">2012</year><data-title>SongBrowser.</data-title><source>The University of Texas at San Antonio.</source><ext-link ext-link-type="uri" xlink:href="https://www.utsa.edu/troyerlab/software.html">https://www.utsa.edu/troyerlab/software.html</ext-link></element-citation></ref><ref id="bib92"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tumer</surname><given-names>EC</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2007">2007</year><article-title>Performance variability enables adaptive plasticity of “crystallized” adult birdsong</article-title><source>Nature</source><volume>450</volume><fpage>1240</fpage><lpage>1244</lpage><pub-id pub-id-type="doi">10.1038/nature06390</pub-id><pub-id pub-id-type="pmid">18097411</pub-id></element-citation></ref><ref id="bib93"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>van der Walt</surname><given-names>S</given-names></name><name><surname>Colbert</surname><given-names>SC</given-names></name><name><surname>Varoquaux</surname><given-names>G</given-names></name></person-group><year iso-8601-date="2011">2011</year><article-title>The NumPy Array: A Structure for Efficient Numerical Computation</article-title><source>Computing in Science &amp; Engineering</source><volume>13</volume><fpage>22</fpage><lpage>30</lpage><pub-id pub-id-type="doi">10.1109/MCSE.2011.37</pub-id></element-citation></ref><ref id="bib94"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Veit</surname><given-names>L</given-names></name><name><surname>Tian</surname><given-names>LY</given-names></name><name><surname>Monroy Hernandez</surname><given-names>CJ</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2021">2021</year><article-title>Songbirds can learn flexible contextual control over syllable sequencing</article-title><source>eLife</source><volume>10</volume><elocation-id>e61610</elocation-id><pub-id pub-id-type="doi">10.7554/eLife.61610</pub-id><pub-id pub-id-type="pmid">34060473</pub-id></element-citation></ref><ref id="bib95"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Virtanen</surname><given-names>P</given-names></name><name><surname>Gommers</surname><given-names>R</given-names></name><name><surname>Oliphant</surname><given-names>TE</given-names></name><name><surname>Haberland</surname><given-names>M</given-names></name><name><surname>Reddy</surname><given-names>T</given-names></name><name><surname>Cournapeau</surname><given-names>D</given-names></name><name><surname>Burovski</surname><given-names>E</given-names></name><name><surname>Peterson</surname><given-names>P</given-names></name><name><surname>Weckesser</surname><given-names>W</given-names></name><name><surname>Bright</surname><given-names>J</given-names></name><name><surname>van der 
Walt</surname><given-names>SJ</given-names></name><name><surname>Brett</surname><given-names>M</given-names></name><name><surname>Wilson</surname><given-names>J</given-names></name><name><surname>Millman</surname><given-names>KJ</given-names></name><name><surname>Mayorov</surname><given-names>N</given-names></name><name><surname>Nelson</surname><given-names>ARJ</given-names></name><name><surname>Jones</surname><given-names>E</given-names></name><name><surname>Kern</surname><given-names>R</given-names></name><name><surname>Larson</surname><given-names>E</given-names></name><name><surname>Carey</surname><given-names>CJ</given-names></name><name><surname>Polat</surname><given-names>İ</given-names></name><name><surname>Feng</surname><given-names>Y</given-names></name><name><surname>Moore</surname><given-names>EW</given-names></name><name><surname>VanderPlas</surname><given-names>J</given-names></name><name><surname>Laxalde</surname><given-names>D</given-names></name><name><surname>Perktold</surname><given-names>J</given-names></name><name><surname>Cimrman</surname><given-names>R</given-names></name><name><surname>Henriksen</surname><given-names>I</given-names></name><name><surname>Quintero</surname><given-names>EA</given-names></name><name><surname>Harris</surname><given-names>CR</given-names></name><name><surname>Archibald</surname><given-names>AM</given-names></name><name><surname>Ribeiro</surname><given-names>AH</given-names></name><name><surname>Pedregosa</surname><given-names>F</given-names></name><name><surname>van Mulbregt</surname><given-names>P</given-names></name><collab>SciPy 1.0 Contributors</collab></person-group><year iso-8601-date="2020">2020</year><article-title>Author Correction: SciPy 1.0: fundamental algorithms for scientific computing in Python</article-title><source>Nature Methods</source><volume>17</volume><fpage>261</fpage><lpage>272</lpage><pub-id pub-id-type="doi">10.1038/s41592-020-0772-5</pub-id><pub-id 
pub-id-type="pmid">32094914</pub-id></element-citation></ref><ref id="bib96"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Warren</surname><given-names>TL</given-names></name><name><surname>Charlesworth</surname><given-names>JD</given-names></name><name><surname>Tumer</surname><given-names>EC</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2012">2012</year><article-title>Variable sequencing is actively maintained in a well learned motor skill</article-title><source>The Journal of Neuroscience</source><volume>32</volume><fpage>15414</fpage><lpage>15425</lpage><pub-id pub-id-type="doi">10.1523/JNEUROSCI.1254-12.2012</pub-id><pub-id pub-id-type="pmid">23115179</pub-id></element-citation></ref><ref id="bib97"><element-citation publication-type="software"><person-group person-group-type="author"><name><surname>Waskom</surname><given-names>M</given-names></name><name><surname>Botvinnik</surname><given-names>O</given-names></name><name><surname>Gelbart</surname><given-names>M</given-names></name><name><surname>Ostblom</surname><given-names>J</given-names></name><name><surname>Hobson</surname><given-names>P</given-names></name><name><surname>Lukauskas</surname><given-names>S</given-names></name><name><surname>Gemperline</surname><given-names>DC</given-names></name><name><surname>Augspurger</surname><given-names>T</given-names></name><name><surname>Halchenko</surname><given-names>Y</given-names></name><name><surname>Warmenhoven</surname><given-names>J</given-names></name><name><surname>Cole</surname><given-names>JB</given-names></name><name><surname>Ruiter</surname><given-names>J</given-names></name><name><surname>Vanderplas</surname><given-names>J</given-names></name><name><surname>Hoyer</surname><given-names>S</given-names></name><name><surname>Pye</surname><given-names>C</given-names></name><name><surname>Miles</surname><given-names>A</given-
names></name><name><surname>Swain</surname><given-names>C</given-names></name><name><surname>Meyer</surname><given-names>K</given-names></name><name><surname>Martin</surname><given-names>M</given-names></name><name><surname>Bachant</surname><given-names>P</given-names></name><name><surname>Quintero</surname><given-names>E</given-names></name></person-group><year iso-8601-date="2020">2020</year><data-title>Mwaskom/seaborn: (September 2020)</data-title><version designator="V0.11.0">V0.11.0</version><source>Zenodo</source><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.4019146">https://doi.org/10.5281/zenodo.4019146</ext-link></element-citation></ref><ref id="bib98"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Waskom</surname><given-names>ML</given-names></name></person-group><year iso-8601-date="2021">2021</year><article-title>seaborn: statistical data visualization</article-title><source>Journal of Open Source Software</source><volume>6</volume><elocation-id>3021</elocation-id><pub-id pub-id-type="doi">10.21105/joss.03021</pub-id></element-citation></ref><ref id="bib99"><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wilbrecht</surname><given-names>L</given-names></name><name><surname>Kirn</surname><given-names>JR</given-names></name></person-group><year iso-8601-date="2004">2004</year><article-title>Neuron addition and loss in the song system: regulation and function</article-title><source>Annals of the New York Academy of Sciences</source><volume>1016</volume><fpage>659</fpage><lpage>683</lpage><pub-id pub-id-type="doi">10.1196/annals.1298.024</pub-id><pub-id pub-id-type="pmid">15313799</pub-id></element-citation></ref><ref id="bib100"><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Wohlgemuth</surname><given-names>MJ</given-names></name><name><surname>Sober</surname><given-names>SJ</given-names></name><name><surname>Brainard</surname><given-names>MS</given-names></name></person-group><year iso-8601-date="2010">2010</year><article-title>Linked control of syllable sequence and phonology in birdsong</article-title><source>The Journal of Neuroscience</source><volume>30</volume><fpage>12936</fpage><lpage>12949</lpage><pub-id pub-id-type="doi">10.1523/JNEUROSCI.2690-10.2010</pub-id><pub-id pub-id-type="pmid">20881112</pub-id></element-citation></ref></ref-list></back><sub-article article-type="editor-report" id="sa0"><front-stub><article-id pub-id-type="doi">10.7554/eLife.63853.sa0</article-id><title-group><article-title>Editor's evaluation</article-title></title-group><contrib-group><contrib contrib-type="author"><name><surname>Goldberg</surname><given-names>Jesse H</given-names></name><role specific-use="editor">Reviewing Editor</role><aff><institution-wrap><institution-id institution-id-type="ror">https://ror.org/05bnh6r87</institution-id><institution>Cornell University</institution></institution-wrap><country>United States</country></aff></contrib></contrib-group></front-stub><body><p>Animals create an enormous diversity of sounds. To study the neural basis or behavioral logic of animal communication, it is first necessary to categorize sounds into distinct types. Here, the authors create a novel neural network that includes an LSTM to enable automated annotation of massive birdsong datasets. 
This widely usable method will have a big impact in the birdsong field and, more generally, will provide an ascendant generation of scientists with yet another example of how machine learning methods are revolutionizing the rigorous study of animal behavior.</p></body></sub-article><sub-article article-type="decision-letter" id="sa1"><front-stub><article-id pub-id-type="doi">10.7554/eLife.63853.sa1</article-id><title-group><article-title>Decision letter</article-title></title-group><contrib-group content-type="section"><contrib contrib-type="editor"><name><surname>Goldberg</surname><given-names>Jesse H</given-names></name><role>Reviewing Editor</role><aff><institution-wrap><institution-id institution-id-type="ror">https://ror.org/05bnh6r87</institution-id><institution>Cornell University</institution></institution-wrap><country>United States</country></aff></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name><surname>Goldberg</surname><given-names>Jesse H</given-names></name><role>Reviewer</role><aff><institution-wrap><institution-id institution-id-type="ror">https://ror.org/05bnh6r87</institution-id><institution>Cornell University</institution></institution-wrap><country>United States</country></aff></contrib><contrib contrib-type="reviewer"><name><surname>Brainard</surname><given-names>Michael</given-names></name><role>Reviewer</role><aff><institution>University of California, San Francisco</institution><country>United States</country></aff></contrib></contrib-group></front-stub><body><boxed-text id="box1"><p>In the interests of transparency, eLife publishes the most substantive revision requests and the accompanying author responses.</p></boxed-text><p><bold>Decision letter after peer review:</bold></p><p>Thank you for submitting your article &quot;TweetyNet: A neural network that enables high-throughput, automated annotation of birdsong&quot; for consideration by <italic>eLife</italic>. 
Your article has been reviewed by 3 peer reviewers, including Jesse H Goldberg as Reviewing Editor and Reviewer #1, and the evaluation has been overseen by Ronald Calabrese as the Senior Editor. The following individual involved in review of your submission has agreed to reveal their identity: Michael Brainard (Reviewer #3).</p><p>The reviewers have discussed the reviews with one another and the Reviewing Editor has drafted this decision to help you prepare a revised submission.</p><p>Summary:</p><p>Cohen and colleagues present an algorithm and software (TweetyNet) to facilitate automated propagation of labels to large sets of vocalizations, such as birdsongs. Currently, a variety of other imperfect methods, including other automated algorithms as well as manual annotation, are used to label the elements of songs. Especially for more complex songs such as those of the Bengalese finch and canary, as are the focus here, human labeling remains a gold standard, and this is problematic because of the amount of time required to label large data sets as well as the potential for inconsistencies in how humans apply labels. Hence, more advanced tools to facilitate the automated annotation of songs will be of significant value for the birdsong community and others working on analogous problems. The paper is very well written and the material that is presented in the main text is largely accessible to a non-technical audience of potential users. The authors provide useful statistics on the performance of TweetyNet (TN) in labeling syllables from both Bengalese finch (BF) and canary songs, and show that it does a better job than a previous approach that, like TN, relies both on local acoustic structure and sequencing of acoustic elements in applying labels. Overall, the authors do a compelling job of presenting TN as a useful tool that may facilitate new findings derived from analyzing larger datasets than is currently feasible. 
As the authors note, this approach is not perfect, but they provide a valuable framework and software package that will enable others to build on their approach.</p><p>Essential revisions:</p><p>(1) All reviewers agreed there was a lack of clarity over how TN is working &quot;under the hood&quot; and choice of parameters.</p><p>1.1. The architecture is clearly laid out in Figure 4 but a much better job of showing visualizations of how an example spectrogram is annotated will be helpful. For example, grayscale matrices (as in the bottom panels of Figure 8) could appear earlier in the paper with Bengalese finch examples to really show readers how each ~2.7 ms timebin (frame) is being assessed, and how each frame's label is converted into a gap-annotated syllable-gap sequence. I'd like to see some kind of visualization that brings the reader from a syllable in a spectrogram to the matrix, to the argmax function, through to annotation; this will be helpful from the outset. The schematic could, if possible, clarify how the LSTM is operating over frames including how many it holds in memory.</p><p>1.2. Given what we know from ASR, the pooling step is likely very important for reducing unwanted input variance. Thus, more details are important here regarding the choice of the max operation, and a performance comparison to other potential types of pooling that justifies the choice. In particular, it is known that max pooling is particularly susceptible to overfitting (Goodfellow et al., 2013), and so some discussion of whether the likely gains that are provided by max pooling are worth the potential costs is warranted. Relatedly, the dimensions of the pooling operation are not clear. In speech recognition it is common to see pooling in frequency (but not time), whereas in image recognition pooling in both spatial dimensions is common. 
The exact form should be clarified, and (if possible) justified.</p><p>1.3 The use of the term 'majority vote' for the post-processing step was very confusing (for me at least) because at first read it appeared to be doing the same thing as the argmax function in the actual neural network. If I understand correctly, each 2.7 ms bin gets assigned a vector of length n (where n = number of syllables trained + silence). Syllables (a sequence of frames flanked by silence) will have a sequence of labels of length (sylldur)/2.7 ms. Argmax takes the most probable of those labels and annotates the syllable accordingly. So – intuitively, the argmax is basically taking a majority vote. In the postprocessing step, I think that after the network has annotated a given syllable, it will enforce that all frames inherit the label of the syllable. So if a 270 ms syllable was initially labeled 1111221111 (and called syllable 1) then the majority vote post-processing step will go back and override the frame labels as 1111111111. This probably helps with correct annotations at sequence transitions, but it's not exactly clear to me why this is the case unless the majority vote is occurring before a re-annotation step. If I understand this correctly (which I may not) – then the term majority vote is a bit misleading and should instead be called something like 'frame override.' If I am not understanding this correctly, then the authors should really clarify exactly what this postprocessing step is and why it works.</p><p>1.4. Relative influence of sequential versus local features. As I understand it, TN takes as input spectrograms in which each column of the spectrogram (corresponding to some small period of time) is a &quot;bin&quot;, and a set of labels for each bin that are derived from how the corresponding segments of song were labeled by manual annotation. 
The network is then trained on &quot;windows&quot; that contain some large number (hundreds) of contiguous bins and their labels corresponding to many hundreds of milliseconds of song. I would appreciate some guidance regarding how performance depends on the relative influence of local acoustic structure (within a single bin, or within a single syllable) and more global sequential structure that depends on the specific sequence of syllables within-training windows. I assume that both of these will affect the labels that are applied to bins, but I have little intuition about their relative influences. Will this tend to work against the correct labeling of rare variants? For example if the sequence 'abc' is very prevalent in a song, and the sequence 'abb' is very rare, will the labeling of 'abb' be biased towards the labels 'abc'? More generally, it would be helpful to have a more pointed discussion of the degree to which TN performance depends on choices for parameters such as bin size and window size – is this something that a user might want to differently adjust for BF versus canary song versus other types of vocalizations?</p><p>1.5. Please describe the weight sharing strategy. Different frequency ranges are likely to have different local feature behaviors. For large numbers of hidden units it is likely that a full weight sharing strategy is most beneficial in the CNN, but this should be confirmed. If the authors investigated other weight sharing strategies, error rates for different weight sharing strategies should be compared under scenarios where the number of hidden units in the CNN are varied systematically. If not, they should provide a different justification for their current choice.</p><p>(2) Better clarify failure modes, and provide a more accurate description of TN limitations</p><p>All reviewers were surprised and skeptical at the suggestion that TN could be good for labeling juvenile songs, mouse vocalizations, and bat vocalizations. 
My impression is that the issues about human labeling are greatly exacerbated for these sorts of data sets, and that they have not proven to be very amenable to analyses that try to apply discrete categorical labels. So these sorts of data sets seem like they might be better approached with analysis schemes that do not rely on ascribing specific categorical labels. It would be helpful to elaborate on how TN would work in such cases, and perhaps illustrate with some juvenile finch songs or other highly variable and apparently non-categorical vocalizations (such as from mice or bats, see also point 3.1 below). Inclusion of such new analyses is not necessary for publication – but if such analyses are not conducted any claim of the utility of TN for these purposes should be dropped from the manuscript – to ensure that the results of the TN performance accurately match the claims made.</p><p>(3) Better integration with parallel work:</p><p>3.1. Beyond the forgoing justifications for the implementation of the specific networks presented in the paper, there is a broader question as to whether the supervised learning approach is an ideal solution for all or even most cases. A variety of unsupervised, self-supervised, and semi-supervised approaches are available. Of course, there are far too many to compare all of these, but some discussion of these alternatives at the end of the paper is warranted. In particular, DeepSqueak [Coffey et al., 2019] offers a similar CNN-driven front end but with unsupervised clustering of features from USVs (and see [Goffinet et al., biorxiv]). Other recent work [Sainburg et al., 2020] applies fully unsupervised approaches to find song syllables and directly compares automated clusters to hand labels for several songbird species (but not canaries). It seems these unsupervised approaches would be better suited for highly variable songs (e.g. babbling, budgerigar warble). 
If this is the case then the authors should explicitly say so – so readers know what conditions are suited for TN and what conditions are not.</p><p>3.2. The authors suggest that ref 39 represents state of prior art, but should elaborate: ref 39 shows substantially better performance than is attributed to it when given larger training sets or other variations in application, and my impression is that other methods may also do substantially better than ref 39, and therefore provide additional useful benchmarks for comparing TN performance. Some of these take an approach of segmenting syllables and then clustering and labelling syllables by their acoustic features, independently of sequential context. This forgoes the potential value of statistical regularities in sequence, but correspondingly avoids any potential issues associated with correct detection and labeling of syllables that occur in rare sequence variants, about which I have inquired above. I would be especially interested in seeing a comparison of TN relative to some version of those other approaches. For example, it would be nice to see how the Hybrid Vocal Classifier (HVC), developed by one of the co-authors (Nicholson), compares with TN in labeling of BF and canary songs. I would consider a comparison of performance between TN and HVC of value in assessing TN performance relative to other approaches that rely on individually segmented syllables. Another potential benchmark for comparison of TN performance would be the support vector machine approach in Tachibana, Oosugi, Okanoya (2014), which appears to suggest similar performance to TN, and may have a repository of labeled BF songs that would facilitate comparisons. 
It is not incumbent on the authors to test performance against all prior approaches, but without further evaluation against some alternatives, it would be appropriate to temper claims that TN is broadly better than other extant algorithms (at least in terms of syllable error rates for BF songs) and instead further emphasize some of its specific differences and strengths relative to other methods (such as not requiring pre-segmentation, use of small training sets, implicit incorporation of sequential information, etc). In this respect the authors emphasize that TN was developed to deal with canary songs, and I would not be surprised if its performance relative to other algorithms was further differentiated when applied to these more complex (than BF) songs. In that respect, running some of the annotated canary songs through HVC or other labeling algorithms might further clarify the relative strengths of TN and could be a valuable addition.</p></body></sub-article><sub-article article-type="reply" id="sa2"><front-stub><article-id pub-id-type="doi">10.7554/eLife.63853.sa2</article-id><title-group><article-title>Author response</article-title></title-group></front-stub><body><disp-quote content-type="editor-comment"><p>Essential revisions:</p><p>(1) All reviewers agreed there was a lack of clarity over how TN is working &quot;under the hood&quot; and choice of parameters.</p><p>1.1. The architecture is clearly laid out in Figure 4 but a much better job of showing visualizations of how an example spectrogram is annotated will be helpful. For example, grayscale matrices (as in the bottom panels of Figure 8) could appear earlier in the paper with Bengalese finch examples to really show readers how each ~2.7 ms timebin (frame) is being assessed, and how each frame's label is converted into a gap-annotated syllable-gap sequence. 
I'd like to see some kind of visualization that brings the reader from a syllable in a spectrogram to the matrix, to the argmax function, through to annotation; this will be helpful from the outset. The schematic could, if possible, clarify how the LSTM is operating over frames including how many it holds in memory.</p></disp-quote><p>Following reviewers' suggestions, we have revised what was Figure 4, now Figure 2. The figure now dedicates a separate panel (A) to illustrate the process of labeling an example spectrogram – including the processing of the deep network outputs (the grayscale matrices in what was Figure 8) to estimated syllable segments. In panel B the figure illustrates the properties of the constituent components (convolutional, recurrent, projection). We now make clear in the figure caption and in the methods that the bidirectional LSTM operates on the entire sequence in both directions.</p><disp-quote content-type="editor-comment"><p>1.2. Given what we know from ASR, the pooling step is likely very important for reducing unwanted input variance. Thus, more details are important here regarding the choice of the max operation, and a performance comparison to other potential types of pooling that justifies the choice. In particular, it is known that max pooling is particularly susceptible to over fitting (Goodfellow et al., 2013), and so some discussion of whether the likely gains that are provided by max pooling are worth the potential costs is warranted. Relatedly, the dimensions of the pooling operation are not clear. In speech recognition it is common to see pooling in frequency (but not time), whereas in image recognition pooling in both spatial dimensions is common. The exact form should be clarified, and (if possible) justified.</p></disp-quote><p>We thank the reviewers for pointing out this oversight we made when describing our model and how we developed it. We added details in the introduction and in the methods to address these concerns. 
Specifically we now cite relevant literature to justify our choice of max pooling, and underscore that previous work has shown no advantage of alternative types of pooling (Sainath et al., 2013). As the work we cite shows, all current state-of-the-art models for similar tasks use standard max pooling with no down-sampling in the time dimension. To make the dimensions of the pooling operation clear, we specifically state in the introduction that we use a stride of 1 in the temporal dimension to avoid down-sampling, and provide further details in the methods. To justify this choice, we cite appropriate literature which found that there was no advantage to using larger strides in the temporal dimension within pooling layers.</p><p>We describe the measures we took to avoid overfitting below. We agree with the reviewers that multiple methods to avoid overfitting exist. Specifically, Goodfellow 2013 discusses ‘Maxout’, an alternative to the ‘dropout’ technique. Both techniques are not related to max pooling and, to our knowledge, ‘Maxout’ is not widely adopted. Our understanding is that max pooling improves generalization (and thus helps avoid overfitting) because it encourages invariance to small shifts in features, but as the work we cited shows, there is little of this benefit seen when pooling in the time domain.</p><disp-quote content-type="editor-comment"><p>1.3 The use of the term 'majority vote' for the post-processing step was very confusing (for me at least) because at first read it appeared to be doing the same thing as the argmax function in the actual neural network. If I understand correctly, each 2.7 ms bin gets assigned a vector of length n (where n = number of syllables trained + silence). Syllables (a sequence of frames flanked by silence) will have a sequence of labels of length (sylldur)/2.7 ms. Argmax takes the most probable of those labels and annotates the syllable accordingly. So – intuitively, the argmax is basically taking a majority vote. 
In the postprocessing step, I think that after the network has annotated a given syllable, it will enforce that all frames inherit the label of the syllable. So if a 270 ms syllable was initially labeled 1111221111 (and called syllable 1) then the majority vote post-processing step will go back and override the frame labels as 1111111111. This probably helps with correct annotations at sequence transitions, but it's not exactly clear to me why this is the case unless the majority vote is occurring before a re-annotation step. If I understand this correctly (which I may not) – then the term majority vote is a bit misleading and should instead be called something like 'frame override.' If I am not understanding this correctly, then the authors should really clarify exactly what this postprocessing step is and why it works.</p></disp-quote><p>We agree with the reviewers that we did not clearly define and illustrate the term &quot;majority vote&quot;. To clearly define this term and clarify its relationship to argmax, we made several changes. When addressing revision point 1.1, we added a graphic illustration of how this clean-up step is performed to the architecture diagram. In the figure we conceptually separated, as two panels, the deep neural network (DNN) whose parameters are learned from the training data, and the steps converting the DNN’s output to segmented syllables. The latter (panel A in the figure) graphically separates the step assigning a single label to each spectrogram time bin (argmax), and what we called ‘majority vote’ – the step taking continuous runs of non-silence time bins and assigning them one label by a majority vote among them.</p><p>We now dedicate a section (“Post-processing neural network output and converting it to annotations”) to similarly make clear the relationship between the argmax and the majority vote steps. 
In addition we adapted the reviewer's suggested language to explain how the majority vote overrides the label that the network predicts. The text reads: “Second, we then take a &quot;majority vote&quot; by counting how many times each label is assigned to any time bin in a segment, and then assigning the most frequently occurring label to all time bins in the segment, overriding any others.”</p><disp-quote content-type="editor-comment"><p>1.4. Relative influence of sequential versus local features. As I understand it, TN takes as input spectrograms in which each column of the spectrogram (corresponding to some small period of time) is a &quot;bin&quot;, and a set of labels for each bin that are derived from how the corresponding segments of song were labeled by manual annotation. The network is then trained on &quot;windows&quot; That contain some large number (hundreds) of contiguous bins and their labels corresponding to many hundreds of milliseconds of song. I would appreciate some guidance regarding how performance depends on the relative influence of local acoustic structure (within a single bin, or within a single syllable) and more global sequential structure that depends on the specific sequence of syllables within-training windows. I assume that both of these will affect the labels that are applied to bins, but I have little intuition about their relative influences. Will this tend to work against the correct labeling of rare variants? For example if the sequence 'abc' is very prevalent in a song, and the sequence 'abb' is very rare, will the labeling of 'abb' be biased towards the labels 'abc'? 
More generally, it would be helpful to have a more pointed discussion of the degree to which TN performance depends on choices for parameters such as bin size and window size – is this something that a user might want to differently adjust for BF versus canary song versus other types of vocalizations?</p></disp-quote><p>We thank the reviewers for raising these important questions which we have addressed with extensive follow-up experiments to better understand the relative influence of local and global features. We did this in two ways: (1) by varying the size of the spectrogram window fed to the network and controlling the amount of global &quot;context&quot; that the network sees, and (2) by varying the size of the hidden state within the network’s recurrent layer and affecting the network’s ability to integrate information over time steps. We include the results in main figure 5. By running these experiments we show the network depends on both these variables, and that the hyperparameters we used for both Bengalese finches and canaries in other figures were not poorly chosen.</p><p>To address the question about errors in rare sequence variants we carefully identified and analyzed these variants in Bengalese finch data. For all sequences of syllables a-b we examined all possibilities for the following syllable and identified the most frequent sequence, a-b-x. Then, among all sequences a-b-y (y≠x) that are at least 4 times less frequent than a-b-x, we measured the frame error rate during the syllable y. This detailed analysis showed that there is a very small effect on rare variants. We include a description of this analysis in a methods section “Errors in rare sequences”. We include a supplementary figure to show this analysis. 
The figure shows that when using model parameters and training set durations, presented in the manuscript for well-trained models, there is no relation between error rates in syllables occurring in rare sequences and the rarity of the sequence. The figure also shows that in poorly-trained or parameter-impoverished models the likelihood of errors indeed increases in some rarely-occurring sequences. We believe this meets the reviewers’ prediction but since it only occurs in badly-formed or poorly-trained networks we do not include it as a result but reference it from the main manuscript.</p><p>Finally, we have added language to the methods to further address questions about bin size and window size. We make it clear that users must choose a bin size that does not prevent the model from being able to segment the smallest occurring gaps in song. The results presented in figure 5 also help guide readers towards good starting points for choosing hyperparameters for Bengalese finches and canaries.</p><disp-quote content-type="editor-comment"><p>1.5. Please describe the weight sharing strategy. Different frequency ranges are likely to have different local feature behaviors. For large numbers of hidden units it is likely that a full weight sharing strategy is most beneficial in the CNN, but this should be confirmed. If the authors investigated other weight sharing strategies, error rates for different weight sharing strategies should be compared under scenarios where the number of hidden units in the CNN are varied systematically. If not, they should provide a different justification for their current choice.</p></disp-quote><p>We agree with the reviewers that it is important to make clear to researchers unfamiliar with neural networks that an advantage of convolutional neural networks is how they enable weight-sharing. We now describe how our architecture uses the common &quot;full&quot; weight sharing strategy. 
To our knowledge, studies of other forms of weight sharing did not show any clear improvements, and are not widely adopted; all state-of-the-art architectures for automatic speech recognition and audio event detection with convolutional layers simply use full weight sharing. In the methods, we cite &quot;early&quot; (Abdel-Hamid et al., 2014) work that specifically tested alternative weight sharing strategies for ASR, and note that this has not been widely adopted.</p><disp-quote content-type="editor-comment"><p>(2) Better clarify failure modes, and provide a more accurate description of TN limitations</p><p>All reviewers were surprised and skeptical at the suggestion that TN could be good for labeling juvenile songs, mouse vocalizations, and bat vocalizations. My impression is that the issues about human labeling are greatly exacerbated for these sorts of data sets, and that they have not proven to be very amenable to analyses that try to apply discrete categorical labels. So these sorts of data sets seem like they might be better approached with analysis schemes that do not rely on ascribing specific categorical labels. It would be helpful to elaborate on how TN would work in such cases, and perhaps illustrate with some juvenile finch songs or other highly variable and apparently non-categorical vocalizations (such as from mice or bats, see also point 3.1 below). Inclusion of such new analyses is not necessary for publication – but if such analyses are not conducted any claim of the utility of TN for these purposes should be dropped from the manuscript – to ensure that the results of the TN performance accurately match the claims made.</p></disp-quote><p>We recognize that our writing was unclear here and that it is confusing for readers. 
We revised language about applying TN to other vocalizations, such as highly variable juvenile finch song, so that it does not sound as if we are suggesting that these vocalizations could somehow be cleanly categorized into discrete labels. Instead we make it clear that TN can potentially be used as a binary classifier that simply classifies each time bin as &quot;vocalization&quot; or &quot;non-vocalization&quot;, in effect reproducing a user's cleaned-up segmentations, so that these segmented variable vocalizations can then be used for downstream analysis. To make it clear that these are not claims about our results, we have moved these suggestions to the &quot;Ideas and Speculation&quot; section of the discussion as suggested by <italic>eLife</italic> guidelines. We are in contact with other groups that are using TweetyNet for this exact purpose, but we feel that including such demonstrations is out of scope for the claims we make in the paper.</p><disp-quote content-type="editor-comment"><p>(3) Better integration with parallel work:</p><p>3.1. Beyond the forgoing justifications for the implementation of the specific networks presented in the paper, there is a broader question as to whether the supervised learning approach is an ideal solution for all or even most cases. A variety of unsupervised, self-supervised, and semi-supervised approaches are available. Of course, there are far too many to compare all of these, but some discussion of these alternatives at the end of the paper is warranted. In particular, DeepSqueak [Coffey et al., 2019] offers a similar CNN-driven front end but with unsupervised clustering of features from USVs (and see [Goffinet et al., biorxiv]). Other recent work [Sainburg et al., 2020] applies fully unsupervised approaches to find song syllables and directly compares automated clusters to hand labels for several songbird species (but not canaries). 
It seems these unsupervised approaches would be better suited for highly variable songs (e.g. babbling, budgerigar warble). If this is the case then the authors should explicitly say so – so readers know what conditions are suited for TN and what conditions are not.</p></disp-quote><p>We agree with the reviewers that it is very important to compare and contrast our approach with others. We have revised the existing references to the papers that reviewers cited so that the discussion very clearly states the questions that the reviewers raised. We have added our current understanding of how these approaches are related and when they can work together.</p><disp-quote content-type="editor-comment"><p>3.2. The authors suggest that ref 39 represents state of prior art, but should elaborate: ref 39 shows substantially better performance than is attributed to it when given larger training sets or other variations in application, and my impression is that other methods may also do substantially better than ref 39, and therefore provide additional useful benchmarks for comparing TN performance. Some of these take an approach of segmenting syllables and then clustering and labelling syllables by their acoustic features, independently of sequential context. This forgoes the potential value of statistical regularities in sequence, but correspondingly avoids any potential issues associated with correct detection and labeling of syllables that occur in rare sequence variants, about which I have inquired above. I would be especially interested in seeing a comparison of TN relative to some version of those other approaches. For example, it would be nice to see how the Hybrid Vocal Classifier (HVC), developed by one of the co-authors (Nicholson), compares with TN in labeling of BF and canary songs. I would consider a comparison of performance between TN and HVC of value in assessing TN performance relative to other approaches that rely on individually segmented syllables. 
Another potential benchmark for comparison of TN performance would be the support vector machine approach in Tachibana, Oosugi, Okanoya (2014), which appears to suggest similar performance to TN, and may have a repository of labeled BF songs that would facilitate comparisons. It is not incumbent on the authors to test performance against all prior approaches, but without further evaluation against some alternatives, it would be appropriate to temper claims that TN is broadly better than other extant algorithms (at least in terms of syllable error rates for BF songs) and instead further emphasize some of its specific differences and strengths relative to other methods (such as not requiring pre-segmentation, use of small training sets, implicit incorporation of sequential information, etc). In this respect the authors emphasize that TN was developed to deal with canary songs, and I would not be surprised if its performance relative to other algorithms was further differentiated when applied to these more complex (than BF) songs. In that respect, running some of the annotated canary songs through HVC or other labeling algorithms might further clarify the relative strengths of TN and could be a valuable addition.</p></disp-quote><p>We thank the reviewer for pointing out the need to better compare with other methods. We reached out to the authors of ref 39 and after discussion realized that there was a crucial difference between our methods and theirs. After communicating with the authors and doing further careful inspection of the annotation, we realized that the paper (ref 39) only selects certain sequences within much longer songs. As a result, the paper does not benchmark how well their algorithms perform when annotating entire songs. The central goal of our algorithm is to annotate entire songs end-to-end with a single model. Therefore we removed direct comparisons between our methods and theirs, although of course we still cite it as related work. 
Because we realized upon close inspection of the dataset that there were sequences that were left unannotated, we chose four birds and completely annotated all songs for those birds. We provide these complete annotations with the datasets we are submitting so that readers would still be able to use the song to benchmark ours and other models.</p><p>We carried out the analysis the reviewers suggested. The first figure in the results now compares TweetyNet with a support vector machine (SVM) model built into the `hvc` library that uses features from Tachibana 2014. The reviewers are correct: it is clear that given cleanly segmented audio, SVM models can perform quite well for Bengalese finch song. Our results also make it quite clear why TweetyNet is needed for canary song, since there are no reliable segmentation algorithms, and even with cleanly segmented data by human annotators, the difference between TweetyNet and SVM performance is negligible. Accordingly, we have also tempered claims and changed language to be more specific about the strengths and weaknesses of different approaches, as suggested.</p></body></sub-article></article>