<!doctype html>
<html lang="en">

  <head>
    <title>Automated annotation of birdsong with a neural network that segments spectrograms</title>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <link href="https://unpkg.com/@stencila/thema@2/dist/themes/elife/styles.css" rel="stylesheet">
    <script src="https://unpkg.com/@stencila/thema@2/dist/themes/elife/index.js"
      type="text/javascript"></script>
    <script
      src="https://unpkg.com/@stencila/components@&lt;=1/dist/stencila-components/stencila-components.esm.js"
      type="module"></script>
    <script
      src="https://unpkg.com/@stencila/components@&lt;=1/dist/stencila-components/stencila-components.js"
      type="text/javascript" nomodule=""></script>
  </head>

  <body>
    <main role="main">
      <article itemscope="" itemtype="http://schema.org/Article" data-itemscope="root">
        <h1 itemprop="headline">Automated annotation of birdsong with a neural network that segments
          spectrograms</h1>
        <meta itemprop="image"
          content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=Automated%20annotation%20of%20birdsong%20with%20a%20neural%20network%20that%20segments%20spectrograms">
        <ol data-itemprop="authors">
          <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
            <meta itemprop="name" content="Yarden Cohen"><span data-itemprop="givenNames"><span
                itemprop="givenName">Yarden</span></span><span data-itemprop="familyNames"><span
                itemprop="familyName">Cohen</span></span><span data-itemprop="emails"><a
                itemprop="email"
                href="mailto:yarden.j.cohen@weizmann.ac.il">yarden.j.cohen@weizmann.ac.il</a></span><span
              data-itemprop="affiliations"><a itemprop="affiliation"
                href="#author-organization-1">1</a></span>
          </li>
          <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
            <meta itemprop="name" content="David Aaron Nicholson"><span
              data-itemprop="givenNames"><span itemprop="givenName">David</span><span
                itemprop="givenName">Aaron</span></span><span data-itemprop="familyNames"><span
                itemprop="familyName">Nicholson</span></span><span data-itemprop="affiliations"><a
                itemprop="affiliation" href="#author-organization-2">2</a></span>
          </li>
          <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
            <meta itemprop="name" content="Alexa Sanchioni"><span data-itemprop="givenNames"><span
                itemprop="givenName">Alexa</span></span><span data-itemprop="familyNames"><span
                itemprop="familyName">Sanchioni</span></span><span data-itemprop="affiliations"><a
                itemprop="affiliation" href="#author-organization-3">3</a></span>
          </li>
          <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
            <meta itemprop="name" content="Emily K Mallaber"><span data-itemprop="givenNames"><span
                itemprop="givenName">Emily</span><span itemprop="givenName">K</span></span><span
              data-itemprop="familyNames"><span itemprop="familyName">Mallaber</span></span><span
              data-itemprop="affiliations"><a itemprop="affiliation"
                href="#author-organization-3">3</a></span>
          </li>
          <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
            <meta itemprop="name" content="Viktoriya Skidanova"><span
              data-itemprop="givenNames"><span itemprop="givenName">Viktoriya</span></span><span
              data-itemprop="familyNames"><span itemprop="familyName">Skidanova</span></span><span
              data-itemprop="affiliations"><a itemprop="affiliation"
                href="#author-organization-3">3</a></span>
          </li>
          <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
            <meta itemprop="name" content="Timothy J Gardner"><span data-itemprop="givenNames"><span
                itemprop="givenName">Timothy</span><span itemprop="givenName">J</span></span><span
              data-itemprop="familyNames"><span itemprop="familyName">Gardner</span></span><span
              data-itemprop="emails"><a itemprop="email"
                href="mailto:timg@uoregon.edu">timg@uoregon.edu</a></span><span
              data-itemprop="affiliations"><a itemprop="affiliation"
                href="#author-organization-4">4</a></span>
          </li>
        </ol>
        <ol data-itemprop="affiliations">
          <li itemscope="" itemtype="http://schema.org/Organization" itemid="#author-organization-1"
            id="author-organization-1"><span itemprop="name">Rehovot</span><address itemscope=""
              itemtype="http://schema.org/PostalAddress" itemprop="address"><span
                itemprop="addressCountry">Israel</span></address></li>
          <li itemscope="" itemtype="http://schema.org/Organization" itemid="#author-organization-2"
            id="author-organization-2"><span itemprop="name">Atlanta</span><address itemscope=""
              itemtype="http://schema.org/PostalAddress" itemprop="address"><span
                itemprop="addressCountry">United States</span></address></li>
          <li itemscope="" itemtype="http://schema.org/Organization" itemid="#author-organization-3"
            id="author-organization-3"><span itemprop="name">Boston</span><address itemscope=""
              itemtype="http://schema.org/PostalAddress" itemprop="address"><span
                itemprop="addressCountry">United States</span></address></li>
          <li itemscope="" itemtype="http://schema.org/Organization" itemid="#author-organization-4"
            id="author-organization-4"><span itemprop="name">Eugene</span><address itemscope=""
              itemtype="http://schema.org/PostalAddress" itemprop="address"><span
                itemprop="addressCountry">United States</span></address></li>
        </ol><span itemscope="" itemtype="http://schema.org/Organization" itemprop="publisher">
          <meta itemprop="name" content="Unknown"><span itemscope=""
            itemtype="http://schema.org/ImageObject" itemprop="logo">
            <meta itemprop="url"
              content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
          </span>
        </span><time itemprop="datePublished" datetime="2022-01-20">2022-01-20</time>
        <ul data-itemprop="genre">
          <li itemprop="genre">Tools and Resources</li>
        </ul>
        <ul data-itemprop="about">
          <li itemscope="" itemtype="http://schema.org/DefinedTerm" itemprop="about"><span
              itemprop="name">Neuroscience</span></li>
        </ul>
        <ul data-itemprop="keywords">
          <li itemprop="keywords">songbirds</li>
          <li itemprop="keywords">machine learning algorithms</li>
          <li itemprop="keywords">automated annotation</li>
          <li itemprop="keywords">canaries</li>
          <li itemprop="keywords">bengalese finches</li>
          <li itemprop="keywords">song syntax</li>
          <li itemprop="keywords">neural network</li>
          <li itemprop="keywords">sound event detection</li>
          <li itemprop="keywords">Other</li>
        </ul>
        <ul data-itemprop="identifiers">
          <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
            <meta itemprop="propertyID"
              content="https://registry.identifiers.org/registry/publisher-id"><span
              itemprop="name">publisher-id</span><span itemprop="value"
              data-itemtype="http://schema.org/Number">63853</span>
          </li>
          <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
            <meta itemprop="propertyID" content="https://registry.identifiers.org/registry/doi">
            <span itemprop="name">doi</span><span itemprop="value">10.7554/eLife.63853</span>
          </li>
          <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
            <meta itemprop="propertyID"
              content="https://registry.identifiers.org/registry/elocation-id"><span
              itemprop="name">elocation-id</span><span itemprop="value">e63853</span>
          </li>
        </ul>
        <section data-itemprop="description">
          <h2 data-itemtype="http://schema.stenci.la/Heading">Abstract</h2>
          <meta itemprop="description"
            content="Songbirds provide a powerful model system for studying sensory-motor learning. However, many analyses of birdsong require time-consuming, manual annotation of its elements, called syllables. Automated methods for annotation have been proposed, but these methods assume that audio can be cleanly segmented into syllables, or they require carefully tuning multiple statistical models. Here, we present TweetyNet: a single neural network model that learns how to segment spectrograms of birdsong into annotated syllables. We show that TweetyNet mitigates limitations of methods that rely on segmented audio. We also show that TweetyNet performs well across multiple individuals from two species of songbirds, Bengalese finches and canaries. Lastly, we demonstrate that using TweetyNet we can accurately annotate very large datasets containing multiple days of song, and that these predicted annotations replicate key findings from behavioral studies. In addition, we provide open-source software to assist other researchers, and a large dataset of annotated canary song that can serve as a benchmark. We conclude that TweetyNet makes it possible to address a wide range of new questions about birdsong.">
          <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Songbirds provide a powerful
            model system for studying sensory-motor learning. However, many analyses of birdsong
            require time-consuming, manual annotation of its elements, called syllables. Automated
            methods for annotation have been proposed, but these methods assume that audio can be
            cleanly segmented into syllables, or they require carefully tuning multiple statistical
            models. Here, we present TweetyNet: a single neural network model that learns how to
            segment spectrograms of birdsong into annotated syllables. We show that TweetyNet
            mitigates limitations of methods that rely on segmented audio. We also show that
            TweetyNet performs well across multiple individuals from two species of songbirds,
            Bengalese finches and canaries. Lastly, we demonstrate that using TweetyNet we can
            accurately annotate very large datasets containing multiple days of song, and that these
            predicted annotations replicate key findings from behavioral studies. In addition, we
            provide open-source software to assist other researchers, and a large dataset of
            annotated canary song that can serve as a benchmark. We conclude that TweetyNet makes it
            possible to address a wide range of new questions about birdsong.</p>
        </section>
        <h2 itemscope="" itemtype="http://schema.stenci.la/Heading" id="introduction">Introduction
        </h2>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Songbirds are an excellent
          model system for investigating sensory-motor learning and production of sequential
          behavior. Birdsong is a culturally transmitted behavior learned by imitation <cite
            itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib56"><span>56</span><span>Mooney</span><span>2009</span></a></cite>.
          Juveniles typically learn song from a tutor, like babies learning to talk. Their songs
          consist of vocal gestures executed in sequence <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib23"><span>23</span><span>Fee and
                Scharff</span><span>2010</span></a></cite>. In this and many other ways, birdsong
          resembles speech <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib12"><span>12</span><span>Brainard and
                Doupe</span><span>2002</span></a></cite>. A key advantage of songbirds as a model
          system is that birds sing spontaneously, producing hundreds of song bouts a day. Their
          natural behavior yields a detailed readout of how learned vocalizations are acquired
          during development and maintained in adulthood. Leveraging this amount of data requires
          methods for high-throughput automated analyses. For example, automated methods for
          measuring similarity of juvenile and tutor song across development <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib88"><span>88</span><span>Tchernichovski et
                  al.</span><span>2000</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib53"><span>53</span><span>Mets and
                  Brainard</span><span>2018</span></a></cite></span> led to important advances in
          understanding the behavioral and genetic bases of how vocalizations are learned <span
            itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib89"><span>89</span><span>Tchernichovski et
                  al.</span><span>2001</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib54"><span>54</span><span>Mets and
                  Brainard</span><span>2018</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib55"><span>55</span><span>Mets and
                  Brainard</span><span>2019</span></a></cite></span>. However, similarly scaling up
          other analyses of vocal behavior is currently hindered by a lack of automated methods.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">A major roadblock to scaling up
          many analyses is that they require researchers to annotate song. Annotation is a
          time-consuming process done by hand with graphical user interface (GUI) applications, for
          example Praat, Audacity, Chipper <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib11"><span>11</span><span>Boersma
                  and Weenink</span><span>2021</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib8"><span>8</span><span>Audacity
                  Team</span><span>2019</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib80"><span>80</span><span>Searfoss
                  et al.</span><span>2020</span></a></cite></span>. To annotate birdsong,
          researchers follow a two-step process <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib90"><span>90</span><span>Thompson
                  et al.</span><span>2012</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib40"><span>40</span><span>Kershenbaum et
                  al.</span><span>2016</span></a></cite></span>. First, they segment song into
          units, often called syllables, and second, they assign each syllable a label. Labels
          correspond to a set of discrete syllable classes that a researcher defines for each
          individual bird. Many models and analyses rely on song annotated at the syllable level,
          including: statistical models of syntax <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib50"><span>50</span><span>Markowitz et
                  al.</span><span>2013</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib38"><span>38</span><span>Jin et
                  al.</span><span>2011</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib9"><span>9</span><span>Berwick et
                  al.</span><span>2011</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib34"><span>34</span><span>Hedley</span><span>2016</span></a></cite></span>;
          computational models of motor learning <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib82"><span>82</span><span>Sober
                  and Brainard</span><span>2009</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib83"><span>83</span><span>Sober
                  and Brainard</span><span>2012</span></a></cite></span>; and analyses that relate
          both acoustic features and sequencing of syllables to neural activity <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib48"><span>48</span><span>Leonardo
                  and Fee</span><span>2005</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib81"><span>81</span><span>Sober et
                  al.</span><span>2008</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib100"><span>100</span><span>Wohlgemuth et
                  al.</span><span>2010</span></a></cite></span>. As these examples demonstrate, our
          ability to leverage songbirds as a model system would be greatly increased if we could
          automate song annotation.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Many previously proposed
          methods for automating annotation follow the same two-step process used when annotating
          manually. We describe the process in more detail, as illustrated in <a href="#fig1"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 1A</a>, to make it clear how
          limitations can arise when automating these two steps. First, audio is segmented into
          syllables by applying a widely-used simple algorithm. Basically, the algorithm consists of
          setting a threshold on amplitude and finding each uninterrupted series of time points
          above that threshold. After segmentation, manual annotation proceeds with a researcher
          assigning labels to syllables (letters ['i', 'a', 'b',…] in <a href="#fig1" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 1A</a>). We emphasize that each
          individual bird will have a unique song, even though songs are recognizably similar within
          a species, and that a researcher chooses an arbitrary set of labels for each individual’s
          repertoire of syllables. This means that any automated method must be capable of reliably
          classifying these arbitrary classes across individuals and species.</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig1" title="Figure 1.">
          <label data-itemprop="label">Figure 1.</label><img
            src="index.html.media/elife-63853-fig1-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="manual-annotation-of-birdsong">Manual annotation of birdsong.</h3>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">(<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">A</strong>) Schematic of the standard
              two-step process for annotating song by hand (e.g. with a GUI application). Top axes
              show a spectrogram generated from a brief clip of Bengalese finch song, with different
              syllable types. Middle and bottom axes show the steps of annotation: first, segments
              are extracted from song by setting a threshold ('thr.', dashed horizontal line, bottom
              axes) on the amplitude and then finding continuous periods above that threshold
              (colored regions of amplitude trace, bottom axes). This produces segments (colored
              bars, middle axes) that an expert human annotator manually labels (characters above
              colored bars), assigning each segment to one of the syllable classes that the
              annotator defines for each individual bird. (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">B</strong>) Examples showing how the
              standard approach of segmenting with a fixed amplitude threshold does not work well
              for canary song. Above-threshold amplitudes are plotted as thicker colored lines. For
              a fixed threshold (dotted line, bottom axes), syllables of type 'b' are correctly
              segmented, but syllables of type &#39;a&#39; are incorrectly segmented into two
              components, and syllables of type &#39;c&#39; are not segmented.</p>
          </figcaption>
        </figure>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig1s1"
          title="Figure 1—figure supplement 1."><label data-itemprop="label">Figure 1—figure
            supplement 1.</label><img src="index.html.media/elife-63853-fig1-figsupp1-v2.jpg" alt=""
            itemscope="" itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="example-of-two-consecutive-canary-phrases-that-differ-mostly-in-inter-syllable-gaps">
              Example of two consecutive canary phrases that differ mostly in inter-syllable gaps.
            </h3>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In this case, annotation
              methods that first segment syllables and then use acoustic parameters to classify them
              will introduce errors. By simultaneously learning acoustic and sequence properties,
              TweetyNet overcomes this weakness.</p>
          </figcaption>
        </figure>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig1s2"
          title="Figure 1—figure supplement 2."><label data-itemprop="label">Figure 1—figure
            supplement 2.</label><img src="index.html.media/elife-63853-fig1-figsupp2-v2.jpg" alt=""
            itemscope="" itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="comparison-of-descriptive-statistics-of-birdsong-syllables-across-species">
              Comparison of descriptive statistics of birdsong syllables across species.</h3>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">(<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">A</strong>) The zebra finch repeating
              motif allows annotation by matching its template spectrogram without segmenting
              different syllables (colored bars). (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">B</strong>) Bengalese finch songs
              segmented to syllables shows variable transitions and changing numbers of syllable
              repeats. (<strong itemscope="" itemtype="http://schema.stenci.la/Strong">C</strong>) A
              third of one domestic canary song of median duration segmented to syllables reveals
              repetitions (phrase) structure. (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">D</strong>) The median, 0.25 and 0.75
              quantiles of song durations (x-axis) and number of syllables per song (y-axis) for two
              canary strains, Bengalese finches and Zebra finches (color coded). (<strong
                itemscope="" itemtype="http://schema.stenci.la/Strong">E</strong>) Variable songs
              are not suited for template matching. Songs contain repeating sequences of syllables
              but because of sequence variability songs with more syllables (x-axis) share smaller
              sequence fractions (y-axis). (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">F</strong>) Distributions of syllable
              duration for one domestic canary. The bird had 20 different syllable types (x-axis,
              ordered by mean syllable duration). Box plot shows median, 0.25 and 0.75 quantiles of
              syllable durations. Whiskers show the entire range.</p>
          </figcaption>
        </figure>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Previous attempts to automate
          the annotation of birdsong kept the segmentation and labeling steps separate, and
          therefore suffered from limitations in each step. Methods such as semi-automatic
          clustering <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib13"><span>13</span><span>Burkett et
                  al.</span><span>2015</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib20"><span>20</span><span>Daou et
                  al.</span><span>2012</span></a></cite></span>, and supervised machine learning
          algorithms <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib91"><span>91</span><span>Troyer
                  lab</span><span>2012</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib85"><span>85</span><span>Tachibana et
                  al.</span><span>2014</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib57"><span>57</span><span>Nicholson</span><span>2016</span></a></cite></span>,
          can fail when the song of a species is not reliably segmented using the standard algorithm
          just described. We illustrate this in <a href="#fig1" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 1B</a> with examples of song from
          canaries. One reason the standard algorithm does not work is that the amplitude of canary
          song varies so greatly that no single threshold reliably segments all syllables. Even for
          species where good segmenting parameters can be found, a given individual’s song will
          often have one or two syllable classes that require an annotator to clean up its onsets
          and offsets by hand. Furthermore, other sounds in the environment, such as beak clicks and
          movement noise, are inevitably segmented as if they were syllables. Machine learning
          models operating on segmented audio will happily assign these segments a syllable class,
          resulting in false positives. Various other statistical methods can be used to remove
          these false positives, such as outlier detection algorithms. In combination with such
          methods, supervised machine learning models have been used to successfully annotate
          large-scale behavioral experiments (e.g. <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib94"><span>94</span><span>Veit et
                al.</span><span>2021</span></a></cite>). But these additional clean-up steps add
          complexity and require the researcher to perform further tuning and validation.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Automated annotation methods
          may also face limitations at the step of labeling segments. Many machine learning models
          make use of pre-defined, engineered features that may not reliably discriminate different
          classes of syllables across individual birds or species. Likewise, features extracted from
          single syllables do not capture temporal dependencies that, if taken into account, can
          improve the classification accuracy <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib6"><span>6</span><span>Anderson
                  et al.</span><span>1996</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib43"><span>43</span><span>Kogan
                  and Margoliash</span><span>1998</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib57"><span>57</span><span>Nicholson</span><span>2016</span></a></cite></span>.
          (An example where temporal features are needed is shown in <a href="#fig1s1" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 1—figure supplement 1</a>.) This issue
          with models that do not leverage temporal information becomes particularly important for
          species whose song has more variable sequencing (see <a href="#fig1s2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 1—figure supplement 2</a>), like the
          Bengalese finch and canary song we study here. Such issues likely account for why there is
          no prior work on algorithms for automated annotation of canary song at the syllable level.
          Canaries have provided unique insights into neuronal regeneration, seasonality,
          interhemispheric coordination, hormones, and behavior <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib27"><span>27</span><span>Goldman
                  and Nottebohm</span><span>1983</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib65"><span>65</span><span>Nottebohm</span><span>1981</span></a></cite><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib84"><span>84</span><span>Suthers et
                  al.</span><span>2012</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib99"><span>99</span><span>Wilbrecht and
                  Kirn</span><span>2004</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib4"><span>4</span><span>Alvarez-Buylla et
                  al.</span><span>1990</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib25"><span>25</span><span>Gardner
                  et al.</span><span>2005</span></a></cite></span>. In spite of this, canary song
          with its rich syllable repertoire and complex song syntax <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib50"><span>50</span><span>Markowitz et
                  al.</span><span>2013</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib3"><span>3</span><span>Alonso et
                  al.</span><span>2009</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib7"><span>7</span><span>Appeltants
                  et al.</span><span>2005</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib2"><span>2</span><span>Alliende
                  et al.</span><span>2013</span></a></cite></span> remains understudied, as does the
          similarly complex song of many other species.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Given the limitations faced by
          existing methods, we sought to develop an algorithm for automated annotation of syllables
          that (1) does not require cleanly segmented audio when predicting labels, (2) only
          requires training a single model, and (3) does not rely on hand-engineered features. To
          meet these criteria, we developed a deep neural network that we call TweetyNet, shown in
          <a href="#fig2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 2</a>. Deep
          neural network models have the potential to address our criteria, because they can learn
          features from the training data itself, and they can be designed to map directly from
          spectrograms of song to predicted annotations, eliminating the need to segment audio.
          Below we test whether TweetyNet meets our criteria. To do so, we benchmark TweetyNet on
          Bengalese finch and canary song. We demonstrate that TweetyNet achieves robust performance
          across species and individuals, whose song can vary widely even within a species, and
          across many bouts of song from one individual, i.e., across days of song. Using large
          datasets from actual behavioral experiments, we show that automated annotations produced
          by TweetyNet replicate key findings about the syntax of song in both species.</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig2" title="Figure 2.">
          <label data-itemprop="label">Figure 2.</label><img
            src="index.html.media/elife-63853-fig2-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="tweetynet-operation-and-architecture">TweetyNet operation and architecture.</h3>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">(<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">A</strong>) TweetyNet takes as input a
              window from a spectrogram, and produces as output an estimate of the probability that
              each time bin in the spectrogram window belongs to a class <span itemscope=""
                itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                    class="mjx-math" aria-label="c"><span class="mjx-mrow" aria-hidden="true"><span
                        class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                          style="padding-top: 0.225em; padding-bottom: 0.298em;">c</span></span></span></span></span></span>
              from the set of predefined syllable classes <span itemscope=""
                itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                    class="mjx-math" aria-label="C"><span class="mjx-mrow" aria-hidden="true"><span
                        class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                          style="padding-top: 0.519em; padding-bottom: 0.298em; padding-right: 0.045em;">C</span></span></span></span></span></span>.
              This output is processed to generate the labeled segments that annotations are
              composed of: (1) We apply the argmax operation to assign each time bin the class with
              the highest probability. (2) We use the ‘background’ class we add during training
              (indicated as ‘Bg.’) to find continuous segments of syllable class labels. (3) We
              post-process these segments, first discarding any segment shorter than a minimum
              duration (dashed circle on right side) and then taking a majority vote to assign each
              segment a single label (dashed circles on left side). (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">B</strong>) TweetyNet maps inputs to
              outputs through a series of operations: (1) The convolutional blocks produce a set of
              feature maps by convolving (asterisk) their input and a set of learned filters
              (greyscale boxes). A max-pooling operation downsamples the feature maps in the
              frequency dimension. (2) The recurrent layer, designed to capture temporal
              dependencies, is made up of Long Short-Term Memory (LSTM) units. We use a bidirectional
              LSTM that operates on the input sequence in both the forward (F) and backward (B)
              directions to produce a hidden state for each time step, modulated by learned weights
              in the LSTM units. (3) The hidden states are projected onto the different syllable
              classes by a final linear transformation, resulting in a vector of class probabilities
              for each time bin <span itemscope=""
                itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                    class="mjx-math" aria-label="t"><span class="mjx-mrow" aria-hidden="true"><span
                        class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                          style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span></span></span></span></span>.
              For further details, please see section ‘Neural network architecture’ in Materials and
              methods.</p>
          </figcaption>
        </figure>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="proposed-model">Proposed
          model</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">First we describe our approach
          in enough detail to provide context. As shown in <a href="#fig2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 2</a>, a TweetyNet model takes as input a
          window from a spectrogram of song, and produces as output a label for each time bin of
          that spectrogram window. Because it labels each time bin in a spectrogram, TweetyNet does
          not require segmented audio to predict annotations. In order to recover segments from the
          network output, we add a ‘background’ class to the set of syllable labels (indicated as
          ‘Bg.’ in <a href="#fig2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure
            2A</a>). This class includes the brief quiet intervals between syllables, as well as
          noises, such as beak clicks, wing flaps, and non-song calls. When predicting annotations
          for new data, we use these time bins classified as ‘background’ to find segmented
          syllables in the network’s output.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="neural-network-architecture">Neural network architecture</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The neural network architecture
          we developed is most closely related to those designed for event detection, as studied
          with audio <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib10"><span>10</span><span>Böck and
                  Schedl</span><span>2012</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib68"><span>68</span><span>Parascandolo et
                  al.</span><span>2016</span></a></cite></span> or video <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib47"><span>47</span><span>Lea et
                al.</span><span>2017</span></a></cite> data, where the task is to map a time series
          to a sequence of segments belonging to different event classes. Like those previous works,
          TweetyNet’s architecture combines two structural elements commonly found in neural
          networks, as shown in <a href="#fig2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 2B</a>. The first element is a
          convolutional block, used in computer vision tasks to learn features from images <span
            itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib28"><span>28</span><span>Goodfellow et
                  al.</span><span>2016</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib22"><span>22</span><span>Farabet
                  et al.</span><span>2013</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib46"><span>46</span><span>Krizhevsky et
                  al.</span><span>2012</span></a></cite></span>. (The term ‘block’ refers to a group
          of operations.) The second element is a recurrent layer, often used to predict sequences
          <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib31"><span>31</span><span>Graves and
                Graves</span><span>2012</span></a></cite>. Specifically, we use a bidirectional Long
          Short-Term Memory (LSTM) layer that has been shown to successfully learn temporal
          correlations present in speech <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib31"><span>31</span><span>Graves and
                Graves</span><span>2012</span></a></cite>, music, and acoustic scenes <span
            itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib10"><span>10</span><span>Böck and
                  Schedl</span><span>2012</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib68"><span>68</span><span>Parascandolo et
                  al.</span><span>2016</span></a></cite></span>. Importantly, we maximized the
          information available to the recurrent layer by choosing parameters for the pooling
          operation in the convolutional blocks that <em itemscope=""
            itemtype="http://schema.stenci.la/Emphasis">did not</em> downsample in the temporal
          dimension. We made this choice based on previous work on automatic speech recognition
          <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib73"><span>73</span><span>Sainath
                  et al.</span><span>2013</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib74"><span>74</span><span>Sainath
                  et al.</span><span>2013</span></a></cite></span>. Please see section ‘Neural
          network architecture’ in Materials and methods for a more detailed description of the
          network architecture, parameters, and citations of relevant literature that motivated our
          design choices.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="post-processing-neural-network-output-and-converting-it-to-annotations">
          Post-processing neural network output and converting it to annotations</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In the results below, we show
          that we significantly reduce error by post-processing network outputs with two simple
          transformations. So that these results are clear, we now describe how we convert outputs
          to annotation, including post-processing. For each window from a spectrogram, the network
          outputs a matrix with shape (<span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="c"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">c</span></span></span></span></span></span>
          classes <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math"
                aria-label="{\displaystyle \times \phantom{\rule{2pt}{0ex}}t}"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-texatom"><span
                      class="mjx-mrow"><span class="mjx-mstyle"><span class="mjx-mrow"><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">×</span></span><span
                            class="mjx-texatom"><span class="mjx-mrow"><span
                                class="mjx-mphantom"><span class="mjx-mrow"><span class="mjx-mspace"
                                    style="width: 0.2em; height: 0px; background-color: black;"></span></span></span></span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span></span></span></span></span></span></span></span></span>
          time bins) (ignoring the batch dimension). Values along dimension <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="c"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">c</span></span></span></span></span></span>
          are the probabilities that the network assigns to each class label. Along that dimension,
          we apply the <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="\mathrm{arg}\mathrm{max}"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-texatom"><span
                      class="mjx-mrow"><span class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                          style="padding-top: 0.151em; padding-bottom: 0.372em;">a</span></span><span
                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                          style="padding-top: 0.151em; padding-bottom: 0.372em;">r</span></span><span
                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                          style="padding-top: 0.151em; padding-bottom: 0.519em;">g</span></span></span></span><span
                    class="mjx-texatom"><span class="mjx-mrow"><span class="mjx-mi"><span
                          class="mjx-char MJXc-TeX-main-R"
                          style="padding-top: 0.151em; padding-bottom: 0.372em;">m</span></span><span
                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                          style="padding-top: 0.151em; padding-bottom: 0.372em;">a</span></span><span
                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                          style="padding-top: 0.151em; padding-bottom: 0.372em;">x</span></span></span></span></span></span></span></span>
          operation (‘argmax’ in <a href="#fig2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 2A</a>) to produce a vector of length
          <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="t"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span></span></span></span></span>,
          where the value in each time bin is the class label that the network estimated had the
          highest probability of occurring in that time bin. We recover segments from this vector by
          finding all uninterrupted runs of syllable labels that are bordered by bins labeled with
          the ‘background’ class. We consider each of these continuous runs of syllable labels to be
          a segment.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">To clean up these segments, we
          apply two transformations (‘Post processing’ in <a href="#fig2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 2A</a>). First, we remove any segment
          shorter than a minimum duration, specified by a user. Second, we then take a ‘majority
          vote’ by counting how many times each label is assigned to any time bin in a segment, and
          then assigning the most frequently occurring label to all time bins in the segment,
          overriding any others. To annotate an entire spectrogram corresponding to one bout of
          song, we feed consecutive windows from the spectrogram into a trained network, concatenate
          the output vectors of labeled timebins, and then apply the post-processing. Finally we
          convert the onset and offset of each predicted segment back to seconds, using the times
          associated with each bin in the spectrogram, and we convert the segment’s integer class
          label back to the character label assigned by human annotators.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Using the method just
          described, a single TweetyNet model trained end-to-end can successfully annotate entire
          bouts of song at the syllable level. We are aware of only one previous study that takes a
          similar approach, from <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib44"><span>44</span><span>Koumura</span><span>2016</span></a></cite>. That
          study evaluated pipelines combining a convolutional neural network for classifying
          spectrogram windows with additional models that learn to correctly predict sequences of
          labels (e.g. Hidden Markov Models). In contrast, TweetyNet is a single neural network
          trained end-to-end, meaning it does not require optimizing multiple models. That previous
          study also focused on annotating specific sequences of interest within a song. Here, our
          goal is to annotate entire song bouts, not specific sequences, so as to automate the
          process as much as possible.</p>
        <h2 itemscope="" itemtype="http://schema.stenci.la/Heading" id="results">Results</h2>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We assess performance of
          TweetyNet in two ways. First, we benchmark TweetyNet as a machine learning model, adopting
          good practices from that literature. We use a metric that we call the syllable error rate,
          by analogy with the word error rate, the standard metric for automatic speech recognition.
          It is an edit distance, meaning its magnitude increases with the number of edits
          (insertions, deletions, and substitutions) required to ‘correct’ the predicted sequence of
          labels so that it matches the ground truth sequence. (For specifics, see ‘Metrics’ in
          Materials and methods.) The edit distance is normalized, converting it into a rate, as
          required to measure performance across sequences of different lengths. We show syllable
          error rate as a percentage throughout for readability. Thus, a 1.0% syllable error rate
          can be thought of as ‘one edit per every 100 syllable labels’. It should be noted, though,
          that the syllable error rate can grow larger than 100%, for example if a predicted
          sequence has many more labels than the original. As results below show, this metric is
          very informative when benchmarking a model such as ours.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The second way we study
          TweetyNet’s performance is meant to align with the point-of-view of an experimentalist,
          who simply wants to know whether the annotations that TweetyNet produces are ‘good enough’
          to answer their research question. To that end, we show that annotations predicted by
          trained TweetyNet models recover key findings from behavioral studies in Bengalese finches
          and canaries, by fitting statistical models of song syntax to predicted annotations.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="tweetynet-avoids-limitations-that-arise-from-segmenting-audio">TweetyNet avoids
          limitations that arise from segmenting audio</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">To show that TweetyNet avoids
          issues that result from relying on segmented audio (as described in the Introduction),
          we compare its performance with a model that predicts labels given engineered acoustic
          features extracted from segmented audio. Specifically, we use a Support Vector Machine
          (SVM) model and pre-defined features adapted from <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib85"><span>85</span><span>Tachibana
                et al.</span><span>2014</span></a></cite> as described in ‘Comparison with a Support
          Vector Machine model’ in Materials and methods. To compare these two models we generated
          learning curves, that plot performance as a function of the amount of manually-annotated
          training data.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The core question is: how does
          each model perform when applied to unlabeled data that has been pre-processed as required,
          <em itemscope="" itemtype="http://schema.stenci.la/Emphasis">without</em> any additional
          inspection or arduous manual cleaning from a human expert? For the SVM, the unlabeled data
          is pre-processed by segmenting the audio, while for TweetyNet, the audio files are
          converted to spectrograms. For both models, data is pre-processed for prediction with the
          exact same parameters used to pre-process training data: for example, with the same
          amplitude threshold used to segment audio. To simulate this for the SVM, we needed to
          re-segment the audio of the test set, because the segments in the ground truth annotations
          have been carefully cleaned by expert human annotators. We then obtained SVM predictions
          for these ‘raw’ segments. For all training set sizes, the syllable error rate of the SVM
          given ‘raw’ segments was higher than the syllable error rate of TweetyNet, as can be seen
          by comparing the dotted orange lines and the solid blue lines in <a href="#fig3"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 3</a>.</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig3" title="Figure 3.">
          <label data-itemprop="label">Figure 3.</label><img
            src="index.html.media/elife-63853-fig3-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="comparison-of-tweetynet-with-a-support-vector-machine-svm-model">Comparison of
              TweetyNet with a support vector machine (SVM) model.</h4>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Plots show syllable error
              rate (y axis) as a function of training set size (x axis, size of training set in
              seconds). Syllable error rate is an edit distance computed on sequences of text
              labels. Here it is measured on a fixed, held-out test set (never seen by the model
              during training). Hues correspond to model type: TweetyNet neural network (blue) or
              SVM (orange). Shaded areas around lines indicate the standard deviation across song of
              individual birds, and across model training replicates (each trained with different
              subsets of data randomly drawn from a total training set, n = 4 Bengalese finches, 10
              replicates per bird; n = 3 canaries, 7 replicates per bird). Line style indicates input
              to model: spectrogram (solid line), or segmented audio, processed in three different
              ways, either manually cleaned by human annotators (dashed), not cleaned at all
              (dotted), or cleaned with a semi-automatic approach (dot-dash).</p>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Data used to generate line
              plots.</p>
          </figcaption>
        </figure>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">This estimate of syllable error
          rate for the SVM may seem overly pessimistic. For example, an expert human annotator could
          remove any non-song noises between song bouts fairly efficiently. To mimic this simple
          cleaning step, we removed any segments in the re-segmented audio that did not occur
          between the first onset and the last offset in the manually annotated, ground truth data.
          Our intent was to remove most of the noises that a human annotator could rapidly identify,
          while leaving any mis-segmented syllables that the annotator would need to carefully
          adjust by hand. In this setting, with semi-automated clean-up of the segments, the SVM
          also had a higher syllable error rate than TweetyNet across all canaries, for all training
          set sizes (compare dash-dotted orange line in <a href="#fig3" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 3</a> with solid blue lines). For
          Bengalese finches, syllable error rate of the SVM started out lower than TweetyNet, but
          with 10 min of training data, error for TweetyNet was lower, and this difference was
          statistically significant (p &lt; 0.001, Wilcoxon signed-rank test). This result indicates
          that much of the increased syllable error rate can be attributed to imperfect segmenting
          of the true syllables and other noises that take place during song bouts.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We did observe that SVM models
          could actually achieve a very low syllable error rate, when provided with audio segments
          that have been manually cleaned by human annotators. SVM predictions on this perfectly
          clean data are lower than the syllable error rate of TweetyNet. For models trained with 10
          min of data, this difference was again significant (p &lt; 0.05, Wilcoxon signed-rank
          test). However, if applying a machine learning model required human annotators to manually
          clean the segments produced from audio by the standard algorithm, it would defeat the
          purpose of automating annotation.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Lastly, we observed that there
          was a much higher standard deviation in error rate, computed across individuals and
          training replicates, for SVM models predicting labels for uncleaned or semi-cleaned
          segmented audio when compared with TweetyNet (p &lt; 0.001, Levene’s test). The standard
          deviation is indicated by the shaded areas in <a href="#fig3" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 3</a>. This result shows that TweetyNet
          performs well across random samples of each bird’s song, because each replicate was
          trained on a randomly drawn subset from a larger pool of training data. Our software
          ensured that at least one instance of each syllable class was present in those subsets
          (please see ’Learning curves’ for details). This result suggests that experimenters will
          not need to carefully construct training sets of data to fit TweetyNet models, as long as
          they ensure that training sets contain a minimum number of instances of each syllable
          class.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="tweetynet-annotates-with-low-error-rates-across-individuals-and-species">TweetyNet
          annotates with low error rates across individuals and species</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The third criterion we set out
          above is that our model should be capable of learning the unique song of each individual.
          Here we show that this criterion is met by our method achieving low error across
          individuals and across species. To show this, we carried out further experiments, adding
          song from an additional four Bengalese finches from the dataset accompanying <cite
            itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib44"><span>44</span><span>Koumura</span><span>2016</span></a></cite> (see
          ‘Annotation of Bengalese finch song’ in Materials and methods for details). This gave us a
          dataset of song from 8 Bengalese finches recorded and annotated in two different research
          groups. In <a href="#fig4" itemscope="" itemtype="http://schema.stenci.la/Link">Figure
            4</a>, we show learning curves for the 8 Bengalese finches and the three canaries, this
          time plotting lines for each individual, to better understand how the model performs for
          each bird’s unique song. Here, we consider the syllable error rate as defined above, and
          in addition the frame error, which is the fraction of time bins classified incorrectly,
          displayed as a percent. Results here and in the next section will demonstrate why it is
          important to measure both the frame error and the syllable error rate. Across all 8
          Bengalese finches, the mean syllable error rate obtained by TweetyNet was 0.9%, and the
          mean frame error was 1.56%. It can be seen that the model performed well across most birds
          and training replicates, although for two birds the syllable error rate exhibited a
          relatively high standard deviation for training sets of size 75 s or less. Across all
          individuals, it appeared that 8–10 min worth of manually annotated data was the minimal
          amount needed to train models achieving the lowest observed syllable error rates. For
          canaries, with 11 min of training data, the mean syllable error rate was 5.17%, and the
          mean frame error was 2.91%. It was unclear from the learning curves for canaries whether
          the syllable error rate of TweetyNet had reached an asymptotic value at the largest
          training set size. Because training models on canary song could be computationally
          expensive, we did not include larger data sets for these curves. To obtain an estimate of
          the asymptotic syllable error rate, for each bird we trained one replicate on a single
          dataset of 60 minutes of song (instead of training multiple replicates with randomly drawn
          subsets of the training data). This produced an estimated asymptotic mean syllable error
          rate of 3.1(± 0.2)% for TweetyNet on canary song. Taken together, these benchmarking
          results suggest that the syllable error rate of TweetyNet is low enough to enable
          automated annotation of large-scale datasets from behavioral experiments. We show this
          rigorously below, but first we interrogate more closely how the model achieves this
          performance.</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig4" title="Figure 4.">
          <label data-itemprop="label">Figure 4.</label><img
            src="index.html.media/elife-63853-fig4-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="performance-of-tweetynet-across-songs-of-8-bengalese-finches-and-three-canaries">
              Performance of TweetyNet across songs of 8 Bengalese finches and three canaries.</h4>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Plots show frame error (y
              axis, top row) and syllable error rate (y axis, bottom row) as a function of training
              set size (x axis, in seconds). Frame error is simple accuracy of labels the network
              predicted for each time bin in spectrograms, while syllable error rate is an edit
              distance computed on sequences of labels for the segments that we recover from the
              vectors of labeled time bins (as described in main text). Thick line is mean across
              all individuals, thinner lines with different styles correspond to individual birds
              (each having a unique song). Shaded areas around lines for each bird indicate standard
              deviation of metric plotted across multiple training replicates, each using a
              different randomly-drawn subset of the training data. Metrics are computed on a fixed
              test set held constant across training replicates. Here hue indicates species (as in
              <a href="#fig5" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5A</a>
              below): Bengalese finches (magenta, left column) and canaries (dark gray, right
              column).</p>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Data used to generate plots
              for Bengalese finches.Data used to generate plots for canaries.</p>
          </figcaption>
        </figure>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="simple-post-processing-greatly-reduces-syllable-error-rates">Simple post-processing
          greatly reduces syllable error rates</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">One of our criteria for an
          automated annotation method was that it should only require training a single model.
          Although our approach meets this criterion, there are of course hyperparameters for
          training the model that we tuned during our experiments, and there is additional
          post-processing applied to the model outputs when converting them to annotations. (The
          term ‘hyperparameter’ refers to parameters that configure the model, such as the batch
          size during training or the size of the spectrogram windows, as opposed to the parameters
          in the model itself, optimized by training.) Here, we take a closer look at how
          post-processing and hyperparameters impact performance, to understand how TweetyNet works
          ‘under the hood’, and to provide a starting point for users applying the model to their
          own data.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">As described above, the
          post-processing consists of first discarding any segments shorter than a minimum duration,
          and then taking a ‘majority vote’ within any consecutive run of labels between time bins
          labeled as ‘background’. To understand how this impacts performance, we computed frame
          error and syllable error rate with and without post-processing, as shown in <a
            href="#fig5" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5</a>. We found
          that post-processing had little effect on the frame error (compare dashed and solid lines
          in <a href="#fig5" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5A</a> top
          row), but that it greatly reduced the syllable error rates (bottom row). To understand
          this difference, we performed further analysis. We found that many of the frame errors
          could be attributed to disagreements between the model predictions and the ground truth
          annotations about the onset and offset times of syllables (see <a href="#fig5s1"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5—figure supplement 1</a>).
          These syllable boundaries are naturally variable in the ground truth data, but such
          mismatches between the model predictions and the ground truth do not change the label
          assigned to a segment, and thus do not contribute to the syllable error rate. We also
          asked whether the increased syllable error rate might be due to errors that result when
          the model sees sequences of syllables that occur with very low probability. We were unable
          to find strong evidence that these infrequently-seen sequences caused the model to make
          errors. Rarely occurring sequences had little effect even when we limited the performance
          of our model by shrinking the size of the hidden state in the recurrent layer (see <a
            href="#fig5s2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5—figure
            supplement 2</a>). The results of this further analysis and the difference we observed
          between frame error and syllable error rate suggested to us that our post-processing
          corrects a small number of mislabeled frames peppered throughout the network outputs,
          which has a comparatively large effect on the syllable error rate.</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig5" title="Figure 5.">
          <label data-itemprop="label">Figure 5.</label><img
            src="index.html.media/elife-63853-fig5-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="the-effect-of-post-processing-and-hyperparameters-on-tweetynet-performance">The
              effect of post-processing and hyperparameters on TweetyNet performance.</h4>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">(<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">A</strong>) Mean frame error (top row) and
              mean syllable error rate, across all birds and training replicates, as a function of
              training set duration. Hue indicates species (Bengalese finches, magenta; canaries,
              dark gray). Line style indicates whether the metric was computed with (solid lines) or
              without (dashed lines) post-processing of the vectors of labeled time bins that
              TweetyNet produces as output. (Note solid lines are same data as <a href="#fig4"
                itemscope="" itemtype="http://schema.stenci.la/Link">Figure 4</a>). (<strong
                itemscope="" itemtype="http://schema.stenci.la/Strong">B, C</strong>). Performance
              for a range of values for two key hyperparameters: the size of windows from
              spectrograms shown to the network (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">B</strong>) and the size of the hidden
              state in the recurrent layer (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">C</strong>). Box-and-whisker plots show
              metrics computed at the maximum training set duration we used for the curves in A
              (‘Max. train dur.’, black arrow in A). We chose the maximum training set durations
              because at those durations the metrics were closest to the asymptotic minimum approached by the
              learning curves. Top row of axes in both B and C shows frame error, and bottom row of
              axes shows syllable error rate. Blue boxes are metrics computed with post-processing
              transforms applied, orange boxes are error rates without those transforms. Tick
              labels in boldface on axes in B and C represent the hyperparameters we used for
              results shown in A, and <a href="#fig3" itemscope=""
                itemtype="http://schema.stenci.la/Link">Figures 3</a> and <a href="#fig4"
                itemscope="" itemtype="http://schema.stenci.la/Link"><span
                  data-itemtype="http://schema.org/Number">4</span></a>.</p>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Data used to generate line
              plots in Figure 5A, B, Figure 5—figure supplement 3.Data used to generate box plots in
              Figure 5B, C, Figure 5—figure supplement 3.</p>
          </figcaption>
        </figure>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig5s1"
          title="Figure 5—figure supplement 1."><label data-itemprop="label">Figure 5—figure
            supplement 1.</label><img src="index.html.media/elife-63853-fig5-figsupp1-v2.jpg" alt=""
            itemscope="" itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="most-frame-errors-of-trained-tweetynet-models-are-disagreement-on-syllable-boundaries-of-02-time-bins">
              Most frame errors of trained TweetyNet models are disagreement on syllable boundaries
              of 0–2 time bins.</h4>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Potential syllable boundary
              disagreements are time bins in which the ground truth test set or the trained
              TweetyNet model disagree and just one of them assigns the ’background’ label. The
              histograms show the distances of those time bins from the nearest syllable boundary in
              test sets 5000 seconds long.</p>
          </figcaption>
        </figure>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig5s2"
          title="Figure 5—figure supplement 2."><label data-itemprop="label">Figure 5—figure
            supplement 2.</label><img src="index.html.media/elife-63853-fig5-figsupp2-v2.jpg" alt=""
            itemscope="" itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="frame-errors-in-rarely-occurring-bengalese-finch-sequences">Frame errors in
              rarely-occurring Bengalese finch sequences.</h4>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Each dot represents a
              syllable sequence a-b-<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">y</strong>. The x-axis shows the ratio
              between the frequency of a-b-<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">y</strong> and the frequency of the most
              common sequence a-b-<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">x</strong> (p(rare) and p(frequent)
              respectively). The y-axis shows the frame error measured in the segments <strong
                itemscope="" itemtype="http://schema.stenci.la/Strong">y</strong> occurring in the
              context a-b-<strong itemscope="" itemtype="http://schema.stenci.la/Strong">y</strong>.
              (<strong itemscope="" itemtype="http://schema.stenci.la/Strong">A</strong>) TweetyNet
              models with very small hidden state sizes will have large error rates in some, but not
              all, of the rarely occurring events. This is seen in the negative Pearson correlation
              (top panel, r values) between the error rate (y-axis) and the relative rarity of the
              sequence (x-axis) and in larger fractions of above-median error rates in the more rare
              events compared to the more common among the data in the top panel (Bars, bottom
              panel. Error bars showing S.E.). These effects are almost never significant (Pearson
              r,p in the top panels and the binomial z-test and p values in the bottom panels) and
              decrease if applying the majority vote transform (orange dots) or when training the
              networks with more data (left to right panels showing training sets 30–600 s long).
              Note these results are from networks trained with a hidden state size of 16. For main
              results in <a href="#fig5" itemscope=""
                itemtype="http://schema.stenci.la/Link">Figures 5A</a> and <a href="#fig6"
                itemscope="" itemtype="http://schema.stenci.la/Link"><span
                  data-itemtype="http://schema.org/Number">6</span></a> we used a size of 256.
              (<strong itemscope="" itemtype="http://schema.stenci.la/Strong">B</strong>) Repeats A
              but with hidden state size of 64, and showing an even smaller effect.</p>
          </figcaption>
        </figure>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig5s3"
          title="Figure 5—figure supplement 3."><label data-itemprop="label">Figure 5—figure
            supplement 3.</label><img src="index.html.media/elife-63853-fig5-figsupp3-v2.jpg" alt=""
            itemscope="" itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="filter-size-experiments">Filter size experiments.</h4>
          </figcaption>
        </figure>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig5s4"
          title="Figure 5—figure supplement 4."><label data-itemprop="label">Figure 5—figure
            supplement 4.</label><img src="index.html.media/elife-63853-fig5-figsupp4-v2.jpg" alt=""
            itemscope="" itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="filter-number-experiments">Filter number experiments.</h4>
          </figcaption>
        </figure>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Next we sought to understand
          how the hyperparameters used during training affected the small number of incorrect frames
          that inflate the syllable error rate. We focused on two key hyperparameters we considered
          most likely to affect syllable error rate: the size of windows from spectrograms shown to
          the network (measured in the number of time bins), and the size of the hidden state in the
          recurrent layer. The window size determines the context the network sees, while the hidden
          state size determines the network’s capacity to integrate contextual information across
          time steps. We ran further experiments using a range of values for both hyperparameters to
          determine how they impact performance. In all cases, we saw that both hyperparameters had
          little effect on frame error (top row in <a href="#fig5" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 5B and C</a>) but a large effect on
          syllable error rate (bottom row in <a href="#fig5" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 5B and C</a>). This difference between
          metrics is again consistent with the idea that the main contributor to the syllable error
          rate is a handful of frame errors scattered across the network outputs. These experiments
          also confirmed that the values we chose to obtain results in <a href="#fig3" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figures 3</a> and <a href="#fig4" itemscope=""
            itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">4</span></a> were close to optimal; smaller
          values would have negatively impacted performance, and larger values would have yielded
          little or no additional gain. (There is no widely-accepted method to find truly optimal
          hyperparameters.) In all cases, the effect of these hyperparameters was clear when looking
          at the model outputs before post-processing (orange boxes in <a href="#fig5" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 5B and C</a>). We did not see any similar
          effect when testing other hyperparameters such as filter size (<a href="#fig5s3"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5—figure supplement 3</a>)
          and number (<a href="#fig5s4" itemscope="" itemtype="http://schema.stenci.la/Link">Figure
            5—figure supplement 4</a>). In total, these results show that our algorithm is in fact
          learning something about the sequences, by leveraging context from the windows that it
          sees and by storing information it propagates across time steps in its hidden state.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">However, our simple
          post-processing step had a much larger effect on both error metrics, making the impact of
          hyperparameters difficult to see when plotted at the same scale (blue boxes in <a
            href="#fig5" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5B and C</a>).
          Therefore, the results also demonstrate that even with well-chosen hyperparameters the
          network outputs contain segmenting errors that our post-processing removes. In all cases,
          we were able to reduce the syllable error rate by nearly an order of magnitude with
          post-processing. We return to this point in the discussion.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="birdsong-annotated-automatically-with-tweetynet-replicates-key-behavioral-findings">
          Birdsong annotated automatically with TweetyNet replicates key behavioral findings</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We next assessed performance of
          TweetyNet in a scenario more similar to how an experimentalist would apply our approach to
          their data. Specifically, we asked whether we could replicate key findings from previous
          behavioral experiments, using annotations predicted by TweetyNet.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="tweetynet-annotation-of-bengalese-finch-song-replicates-statistics-of-branch-points">
          TweetyNet annotation of Bengalese finch song replicates statistics of branch points</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Bengalese finch song is known
          to contain <em itemscope="" itemtype="http://schema.stenci.la/Emphasis">branch
            points</em>, where one class of syllable can transition to two or more other classes. An
          example is shown in <a href="#fig6" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 6A and B</a>. <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib96"><span>96</span><span>Warren et
                al.</span><span>2012</span></a></cite> showed that these transition probabilities
          are stable across many days. We asked if we could replicate this result with automated
          annotation for several full days of recordings, predicted by TweetyNet models trained on a
          relatively small set of manually-annotated songs. To do so, we used the dataset from <cite
            itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib58"><span>58</span><span>Nicholson et
                al.</span><span>2017</span></a></cite>, that contains recordings from 4 Bengalese
          finches, whose every song was manually annotated across 3–4 days. We verified that in the
          ground truth annotations from <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib58"><span>58</span><span>Nicholson
                et al.</span><span>2017</span></a></cite> we could replicate the key finding from
          <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib96"><span>96</span><span>Warren et al.</span><span>2012</span></a></cite>,
          that branch point statistics were stable across several days (Bonferroni-corrected
          pairwise bootstrap test, n.s. in all cases).</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig6" title="Figure 6.">
          <label data-itemprop="label">Figure 6.</label><img
            src="index.html.media/elife-63853-fig6-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="replicating-results-on-branch-points-in-bengalese-finch-song-with-annotations-predicted-by-tweetynet">
              Replicating results on branch points in Bengalese finch song with annotations
              predicted by TweetyNet.</h5>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">(<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">A</strong>) Representative example of a
              Bengalese finch song with a <em itemscope=""
                itemtype="http://schema.stenci.la/Emphasis">branch point</em>: the syllable labeled
              ’e’ can transition to either ’f’, as highlighted with blue rectangles, or to ’i’, as
              highlighted with an orange rectangle. (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">B</strong>) Transition probabilities for
              this branch point, computed from one day of song. (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">C</strong>) Syllable error rates per day
              for each bird from <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                  href="#bib58"><span>58</span><span>Nicholson et
                    al.</span><span>2017</span></a></cite>. Solid line is mean and shaded area is
              standard deviation across 10 training replicates. Line color and style indicate
              individual animals. TweetyNet models were trained on 10 min of manually annotated
              song, a random subset drawn from data for day 1. Then syllable error rates were
              computed for the remaining songs from day 1, and for all songs from all other days.
              (<strong itemscope="" itemtype="http://schema.stenci.la/Strong">D</strong>) Transition
              probabilities across days for the branch point in A and B, computed from the ground
              truth annotations (solid lines) and the annotations predicted by TweetyNet (dashed
              lines). Shaded area around dashed lines is standard deviation of the estimated
              probabilities, across the 10 training replicates. (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">E</strong>) Group analysis of transitions.
              x axis is probability computed from the ground truth annotations, and the y axis is
              probability estimated from the predicted annotations. Dashed line is ‘x = y’, for
              reference. Each (color, marker shape) combination represents one branch point from one
              bird.</p>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Data used to generate line
              plot in Figure 6C.Data used to generate line plot in Figure 6D.Data used to generate
              scatter plot in Figure 6E.</p>
          </figcaption>
        </figure>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Before testing whether we could
          recover this finding from annotations predicted by TweetyNet, we first measured model
          performance across entire days of song. Using models trained on 10 min (for experiments in
          <a href="#fig3" itemscope="" itemtype="http://schema.stenci.la/Link">Figures 3</a> and <a
            href="#fig4" itemscope="" itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">4</span></a>), we predicted annotations for
          the remainder of the songs. As shown in <a href="#fig6" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 6C</a>, we found that these TweetyNet
          models maintained low syllable error rates when measured with entire days of song, without
          exhibiting large fluctuations across days. The syllable error rate ranged from 1% to 5%
          across 3–4 days of song from each of the four birds, comparable to rates observed in <a
            href="#fig4" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 4</a>. We
          emphasize that the days of songs we used as test sets here are much larger than those we
          used to benchmark models in <a href="#fig4" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 4</a>. The mean duration of these test
          sets was 1528 seconds (s.d. 888.6 s, i.e. 25 min mean, 14 min s.d.), in contrast to <a
            href="#fig4" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 4</a> where we
          measured syllable error rates with a fixed test set of 400 s (6 min 40 s).</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Next we asked whether we could
          recover the behavioral findings using annotations predicted by TweetyNet. Applying the
          same analysis from <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib96"><span>96</span><span>Warren et al.</span><span>2012</span></a></cite>,
          we found that annotations predicted by TweetyNet were statistically indistinguishable from
          the ground truth data (Bonferroni-corrected pairwise bootstrap test, again n.s. in all
          cases). This can be seen by overlaying model and ground truth predictions, as in the
          representative example in <a href="#fig6" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 6D</a>. Summary results for all branch
          points on all days in all four birds are shown in <a href="#fig6" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 6E</a>, again illustrating that the
          probabilities estimated from predicted annotations were quite similar to those estimated
          from the ground truth.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="tweetynet-annotation-of-canary-song-replicates-statistical-models-of-song-structure">
          TweetyNet annotation of canary song replicates statistical models of song structure</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Canary songs consist of trills
          of repeated syllables called phrases (<a href="#fig1" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 1B</a>). <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib50"><span>50</span><span>Markowitz
                et al.</span><span>2013</span></a></cite> examined sequences of phrases of
          Waterslager canaries and found transitions with different memory depths. They showed this
          by describing the probability distribution of transition outcomes from certain phrases by
          Markov chains with variable lengths. This syntax structure is captured parsimoniously by
          probabilistic suffix trees (PST) <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib72"><span>72</span><span>Ron et
                al.</span><span>1996</span></a></cite>. The root node in these graphical models,
          appearing in the middle of <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7A and B</a>, represents the zero-order
          Markov, or base rate, frequencies of the different phrases, labeled in different colors
          and letters. Each branch, emanating from the colored letters in <a href="#fig7"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 7</a>, represents the set of
          Markov chains that end in the specific phrase type designated by that label. For example,
          the ’A’ branch in <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7A</a> includes the first order Markov
          model ’A’ and the second order Markov chains ’FA’ and ’1A’ representing the second order
          dependence of the transition from phrase ’A’.</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig7" title="Figure 7.">
          <label data-itemprop="label">Figure 7.</label><img
            src="index.html.media/elife-63853-fig7-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="replicating-and-extending-results-about-canary-syntax-dependencies-with-annotations-predicted-by-tweetynet">
              Replicating and extending results about canary syntax dependencies with annotations
              predicted by TweetyNet.</h5>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">(<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">A</strong>) Long-range order found in 600
              domestic canary songs annotated with a human proof reader (methods, similar dataset size
              to <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                  href="#bib50"><span>50</span><span>Markowitz et
                    al.</span><span>2013</span></a></cite>). Letters and colors indicate phrase
              types. Each branch terminating in a given phrase type indicates the extent to which
              song history impacts transition probabilities following that phrase. Each node
              corresponds to a phrase sequence, annotated in its title, and shows a pie chart
              representing the outgoing transition probabilities from that sequence (e.g. the pie
              ’1A’ shows the probabilities of phrases ’B’, ’C’, and ’F’ which follow the phrase
              sequence ’1→ A’). The nodes are scaled according to their frequency (legend). Nodes
              that can be grouped together (chunked as a sequence) without significantly reducing
              the power of the model are labeled with blue text. These models are built by iterative
              addition of nodes up the branch to represent longer Markov chains, or a
              transition&#39;s dependence on longer sequences of song history. A TweetyNet model was
              trained using 2.2% of 1,764 songs (9.5% of the data in A). The PST created from the
              model’s predicted annotation of the entire dataset is very similar to A (see full
              comparison in <a href="#fig7s1" itemscope=""
                itemtype="http://schema.stenci.la/Link">Figure 7—figure supplement 1</a>). Here,
              branch differences between the hand labeled and model labeled song are marked by red
              and blue dashed lines for added and missed branches. (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">B</strong>) PST created using all 1,764
              hand labeled songs. An almost identical PST was created <em itemscope=""
                itemtype="http://schema.stenci.la/Emphasis">without</em> a human proof reader from a
              TweetyNet model trained on 19% of the data (see full comparison in <a href="#fig7s2"
                itemscope="" itemtype="http://schema.stenci.la/Link">Figure 7—figure supplement
                2</a>).</p>
          </figcaption>
        </figure>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig7s1"
          title="Figure 7—figure supplement 1."><label data-itemprop="label">Figure 7—figure
            supplement 1.</label><img src="index.html.media/elife-63853-fig7-figsupp1-v2.jpg" alt=""
            itemscope="" itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="detailed-comparison-of-syntax-structure-in-600-hand-labeled-or-tweetynet-labeled-canary-songs">
              Detailed comparison of syntax structure in 600 hand labeled or TweetyNet-labeled
              canary songs.</h5>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We plot the full
              probabilistic suffix trees created from 600 hand labeled canary songs (<strong
                itemscope="" itemtype="http://schema.stenci.la/Strong">A</strong>) and from the
              prediction of a TweetyNet model trained on 2.2% of this bird’s song (<strong
                itemscope="" itemtype="http://schema.stenci.la/Strong">B</strong>).</p>
          </figcaption>
        </figure>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig7s2"
          title="Figure 7—figure supplement 2."><label data-itemprop="label">Figure 7—figure
            supplement 2.</label><img src="index.html.media/elife-63853-fig7-figsupp2-v2.jpg" alt=""
            itemscope="" itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="detailed-comparison-of-syntax-structure-in-1764-hand-labeled-or-tweetynet-labeled-canary-songs">
              Detailed comparison of syntax structure in 1764 hand labeled or TweetyNet-labeled
              canary songs.</h5>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Supporting <a href="#fig7"
                itemscope="" itemtype="http://schema.stenci.la/Link">Figure 7B</a>. We plot the full
              probabilistic suffix trees created from 1,764 hand labeled canary songs (<strong
                itemscope="" itemtype="http://schema.stenci.la/Strong">A</strong>) and from the
              prediction of a TweetyNet model trained on 19% of this bird’s song (<strong
                itemscope="" itemtype="http://schema.stenci.la/Strong">B</strong>). The fluctuation
              in transition probabilities accumulates in long sequences and, in this example,
              increases the minimal sequence probability included in the PST. This increase
              prevented the inclusion of the ’N’ branch in the model built on TweetyNet’s
              prediction.</p>
          </figcaption>
        </figure>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We asked if we could replicate
          findings about canary song syntax in a different strain of canaries using a TweetyNet
          model trained on a small manually annotated dataset. <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7</a> demonstrates that annotations
          predicted by TweetyNet had sufficient accuracy on domestic canary song to extract its
          long-range order. In these figures, we set parameters of the PST estimation algorithm to
          derive the deepest syntax structure possible without overfitting, following the approach
          of <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib50"><span>50</span><span>Markowitz et al.</span><span>2013</span></a></cite>
          that used about 600 hand-annotated songs of Waterslager canaries. In this example, using
          2.2% of the data set, about 40 songs, to train a TweetyNet model and predict the rest of
          the data reveals the deep structures shown in <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7A</a>, comparable to using 600 hand
           annotated songs of the same bird. With more training data, TweetyNet’s accuracy improves
          as does the statistical strength of the syntax model. In <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7B</a> a TweetyNet model was trained on
          19% of the data, about 340 songs, and predicted the rest of the data. The resulting syntax
          model can be elaborated to greater depth without overfitting. To validate this deeper
           model, we compared it with a PST fit to all 1,764 songs from the same bird, manually
          annotated, and found that both PSTs were very similar (<a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7B</a>).</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In sum, we find that TweetyNet,
          trained on a small sample of canary song, is accurate enough to automatically derive the
          deep structure that has formed the basis of recent studies <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib50"><span>50</span><span>Markowitz et
                  al.</span><span>2013</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib17"><span>17</span><span>Cohen et
                  al.</span><span>2020</span></a></cite></span>.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="larger-data-sets-of-annotated-canary-song-add-details-and-limit-the-memory-of-the-syntax-structure">
          Larger data sets of annotated canary song add details and limit the memory of the syntax
          structure</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The increase in syntax detail,
          presented in <a href="#fig7" itemscope="" itemtype="http://schema.stenci.la/Link">Figure
            7B</a>, is possible because more rare nodes can be added to the PST without over-fitting
          the data. Formally, the PST precision increase in larger data sets is defined by the
          decrease in minimal node frequency allowed in the process of building PST models (<a
            href="#fig8" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 8</a>), as
          measured in model cross validation (see Materials and methods). In our data set, we find
          an almost linear relation between the number of songs and this measure of precision—close
          to a tenfold precision improvement.</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig8" title="Figure 8.">
          <label data-itemprop="label">Figure 8.</label><img
            src="index.html.media/elife-63853-fig8-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="using-datasets-more-than-five-times-larger-than-previously-explored-increases-statistical-power-and-the-precision-of-syntax-models">
              Using datasets more than five times larger than previously explored increases
              statistical power and the precision of syntax models.</h5>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">(<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">A</strong>) Ten-fold cross validation is
              used in selection of the minimal node probability for the PSTs (x-axis). Lines show
              the mean negative log-likelihood of test set data estimated by PSTs in 10 repetitions
              (methods). Curves are calculated for datasets that are sub sampled from about 5000
              songs. Red dots show minimal values — the optimum for building the PSTs. (<strong
                itemscope="" itemtype="http://schema.stenci.la/Strong">B</strong>) The decrease in
              optimal minimal node probability (y-axis, red dots in panel A) for increasing dataset
              sizes (x-axis) is plotted in gray lines for six birds. The average across animals is
              shown in black dots and line.</p>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Data used to generate lines
              in Figure 8A.Data used to generate dots in Figure 8A.Data used to generate lines in
              Figure 8B.</p>
          </figcaption>
        </figure>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7B</a>, this increased precision allowed
          reliably adding longer branches to the PST to represent longer Markov chains (in
          comparison to <a href="#fig7" itemscope="" itemtype="http://schema.stenci.la/Link">Figure
            7A</a>). In this example, using a dataset three times larger revealed a 5-deep branch
          that initiates with the beginning of song (’1ABGN’), suggestive of a potential global
          time-in-song dependency of that transition. The PST in <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7B</a> also has branches that did not
          ’grow’ compared to <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7A</a> when more songs were analyzed
          (e.g. the ’B’, ’Q’, and ’R’ branches), indicating a potential cutoff of memory depth that
          is crucial in studying the neural mechanisms of song sequence generation.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The data sets used in <a
            href="#fig7" itemscope="" itemtype="http://schema.stenci.la/Link">Figures 7</a> and <a
            href="#fig8" itemscope="" itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">8</span></a>, are about 10 times larger than
          previous studies. To ascertain the accuracy of the syntax models, in creating the data
          sets we manually proofread annotations predicted by TweetyNet (see ‘Annotation of canary
          song’ in Materials and methods). Across five different human proof readers, we compared
          the time required to manually annotate canary song with the proof-reading time, and found
          that using TweetyNet saved 95–97.5% of the labor.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Taken as a whole, results in
          this section show that TweetyNet makes high-throughput automated annotation of behavioral
          experiments possible, greatly reducing labor while scaling up the amount of data that can
          be analyzed by orders of magnitude.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="very-rare-hard-to-classify-vocal-behaviors-can-cause-tweetynet-to-introduce-errors">
          Very rare, hard-to-classify vocal behaviors can cause TweetyNet to introduce errors</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Songbird species vary in the
          degree to which the elements of their song can be categorized into a set of discrete
          classes <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib90"><span>90</span><span>Thompson
                  et al.</span><span>2012</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib76"><span>76</span><span>Sainburg
                  et al.</span><span>2020</span></a></cite></span>. Even for species where expert
          annotators can readily define such a set, there will occasionally be periods in song where
          it is unclear how to classify syllables. Here, we provide examples of these rare cases to
          illustrate how even a well-trained TweetyNet model can introduce errors in annotation when
          the behavior itself cannot be cleanly categorized. The examples we present in <a
            href="#fig9" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 9</a> are from
          canaries, simply because their song can be so highly varied. As these examples illustrate,
          predictions of TweetyNet models are well-behaved when faced with rare variants, assigning
          high probability to the most relevant labels, not to completely unrelated classes of
          syllables. We emphasize that any standard supervised machine learning model that assigns
          only a single label to each segment will be vulnerable to introducing errors like these.
          Such errors raise questions about whether and when birdsong can be categorized into
          discrete syllable classes, questions that are brought back into focus by methods like ours
          that automate the process. As we will now discuss, we see several ways in which future
          work can address these questions.</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig9" title="Figure 9.">
          <label data-itemprop="label">Figure 9.</label><img
            src="index.html.media/elife-63853-fig9-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="rare-variants-of-canary-song-introduce-segmentation-and-annotation-errors">Rare
              variants of canary song introduce segmentation and annotation errors.</h4>
            <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">(<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">A-E</strong>) Spectrograms on top of the
              time-aligned likelihood (gray scale) assigned by a well-trained TweetyNet model to
              each of the labels (y-axis, 30 syllable types and the tag ’<em itemscope=""
                itemtype="http://schema.stenci.la/Emphasis">Bg</em>.’ for the background segments).
              Green and red vertical lines and numbers on top of the spectrograms mark the onset,
              offset, and labels predicted by the model. (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">A,B</strong>) Canary phrase transitions
              can contain a vocalization resembling the two flanking syllables fused together. A
              TweetyNet model trained to split this vocalization performed very well (<strong
                itemscope="" itemtype="http://schema.stenci.la/Strong">A</strong>) but failed in a
              rare variant (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">B</strong>). The network output highlights
              a general property: TweetyNet assigned high likelihood to the same flanking syllable
              types and not to irrelevant syllables. (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">C</strong>) Syllables produced soft, weak,
              and acoustically imprecise at the onset of some canary phrases are mostly captured
              very well by TweetyNet but, on rare occasions, can be missed. In this example the
              model assigned high likelihood to the correct label but higher to the background.
              (<strong itemscope="" itemtype="http://schema.stenci.la/Strong">D</strong>) Some human
              annotators, called &#39;splitters&#39;, define more syllable classes. Others, the
              &#39;lumpers&#39;, group acoustically-diverse vocalizations under the same label.
              TweetyNet models trained on acoustically-close classes assign high likelihood to both
              labels and, on rare occasions, flip between them. This example demonstrates that
              TweetyNet does not use the a-priori knowledge of syllable repeats
              hierarchically-forming canary phrases. (<strong itemscope=""
                itemtype="http://schema.stenci.la/Strong">E</strong>) Canaries can simultaneously
              produce two notes from their two bronchi. This occurs in phrase transitions and the
              spectrogram of the resulting vocalization resembles an overlay of flanking syllables.
              While the network output shows high likelihood for both syllables the algorithm is
              forced to choose just one.</p>
          </figcaption>
        </figure>
        <h2 itemscope="" itemtype="http://schema.stenci.la/Heading" id="discussion">Discussion</h2>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Annotating birdsong at the
          level of syllables makes it possible to answer questions about the syntax governing this
          learned sequential behavior <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib9"><span>9</span><span>Berwick et al.</span><span>2011</span></a></cite>.
          Annotating syllables also makes it possible to describe them in physical units like pitch and
          amplitude that researchers can directly link to muscular and neural activity <span
            itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib81"><span>81</span><span>Sober et
                  al.</span><span>2008</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib100"><span>100</span><span>Wohlgemuth et
                  al.</span><span>2010</span></a></cite></span>. However, for many species of
          songbirds, analyses at the syllable level still require labor-intensive, time-consuming
          manual annotation. There is a clear need for a method that can automate annotation across
          individuals and species, without requiring cleanly segmented audio, and without requiring
          researchers to carefully tune and validate multiple statistical models. To meet this need,
          we developed a neural network, TweetyNet (<a href="#fig2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 2</a>): a single model trained end-to-end
          that learns directly from spectrograms how to automatically annotate each bird’s unique
          song. TweetyNet is deliberately designed to avoid dividing annotation into separate steps
          of segmentation and labeling, and it leverages the strengths of deep learning models to
          learn features for classification from the training data, instead of relying on
          pre-defined engineered features. We showed that our approach mitigates issues that result
          from the assumption that audio can be cleanly segmented into syllables (<a href="#fig3"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 3</a>). TweetyNet performs
          comparably to a carefully tuned Support Vector Machine model operating on pre-defined
          features extracted from manually cleaned, perfectly segmented audio (<a href="#fig3"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 3</a>). This result might
          suggest that an alternative to our approach would be to improve the audio segmentation
          step (e.g. with an alternative algorithm <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a
              href="#bib88"><span>88</span><span>Tchernichovski et
                al.</span><span>2000</span></a></cite>) and to use a state-of-the-art non-neural
          network model (such as XGBoost <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib15"><span>15</span><span>Chen and
                Guestrin</span><span>2016</span></a></cite>). Because such approaches lack the
          flexibility and expressiveness of deep learning models, we believe they will still require
          additional tuning that our method avoids. For example, one could add classes for background
          noise to such models, but this would likely require additional steps to deal with class
          imbalance. Our model and problem formulation <em itemscope=""
            itemtype="http://schema.stenci.la/Emphasis">requires</em> adding an additional
          ‘background’ class, which results in a more general solution (in much the same way that
          object detection models require and benefit from the addition of a background class <cite
            itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib78"><span>78</span><span>Scheirer et
                al.</span><span>2013</span></a></cite>).</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">A natural question raised by
          our results is how TweetyNet integrates the local acoustic structure within a few time
          bins of a spectrogram and the global structure of syllable sequences within spectrogram
          windows. To answer this, and gain a deeper understanding of our approach, we carried out
          experiments varying two key hyperparameters. The first was the size of spectrogram windows
          shown to the network, which determines how much context the network sees, and the second
          was the size of the hidden state in the recurrent layer, which determines the network’s
          capacity to integrate information across time steps. The experiments demonstrated that
          TweetyNet performance depends on well-chosen values for both hyperparameters (<a
            href="#fig5" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5</a>). These
          empirical results support the idea that TweetyNet learns to recognize local features seen
          in syllables <em itemscope="" itemtype="http://schema.stenci.la/Emphasis">and</em> learns
          to leverage sequential information across a spectrogram window.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In addition, we showed our
          approach performs well across multiple individuals from two different species of songbird,
          Bengalese finches and canaries (<a href="#fig4" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 4</a>). We demonstrated that using
          automated annotations produced by TweetyNet, we could replicate key findings from
          long-term behavioral experiments about the syntax of Bengalese finch (<a href="#fig6"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 6</a>) and canary song (<a
            href="#fig7" itemscope="" itemtype="http://schema.stenci.la/Link">Figures 7</a> and <a
            href="#fig8" itemscope="" itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">8</span></a>). Overall, these results show
          that our deep learning-based approach offers a robust, general method for automated
          annotation of birdsong at the syllable level.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="ideas-and-speculation">Ideas
          and speculation</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Our results open up avenues for
          future research in two directions: development of neural network algorithms, and
          applications of those algorithms, once developed. Regarding algorithm development, we
          suggest that future work should test whether networks can be trained to better learn to
          segment <em itemscope="" itemtype="http://schema.stenci.la/Emphasis">without</em>
          post-processing. The experiments in <a href="#fig5" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 5</a> suggested that the post-processing
          we applied improves performance by correcting a small number of mislabeled time bins that
          cause a large increase in syllable error rate. From this, and from previous related work
          <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib47"><span>47</span><span>Lea et al.</span><span>2017</span></a></cite>, our
          sense is that a logical next step will be to incorporate the syllable error rate into the
          loss function, minimizing it directly. This would require some modifications to our
          approach, but may prove more effective than testing different network architectures.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Another important question for
          future work is: when is it appropriate to apply supervised learning algorithms to
          vocalizations, like ours and related object detection-based models <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib16"><span>16</span><span>Coffey
                  et al.</span><span>2019</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib24"><span>24</span><span>Fonseca
                  et al.</span><span>2021</span></a></cite></span>, and when should these algorithms
          be combined or even replaced with unsupervised algorithms. Recently developed unsupervised
          models learn helpful, often simpler, representations of birdsong and other animal
          vocalizations <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib26"><span>26</span><span>Goffinet et
                  al.</span><span>2021</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib75"><span>75</span><span>Sainburg
                  et al.</span><span>2019</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib76"><span>76</span><span>Sainburg
                  et al.</span><span>2020</span></a></cite></span>. These advances and the
          advantages of methods like TweetyNet are not mutually exclusive, and can be integrated in
          different ways depending on the needs of researchers. For example, a TweetyNet model can
          serve as a front-end that finds and classifies segments, which are then passed to an
          unsupervised model. In addition to annotating syllables, we suggest future work consider
          two other levels of classification. The first would use TweetyNet to segment audio into
          just two classes: ‘vocalization’ and ‘non-vocalization’ periods. Treating segmentation as
          a binary classification problem in this way would make it possible to extend our approach
          to vocalizations that are not easily categorized into discrete labels: juvenile birdsong,
          bat calls <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib71"><span>71</span><span>Prat et al.</span><span>2017</span></a></cite>, and
          rodent USVs <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib86"><span>86</span><span>Tachibana et
                al.</span><span>2020</span></a></cite>, for example. Another level of classification
          consists of automatically annotating higher-level structures in song such as motifs,
          phrases <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib50"><span>50</span><span>Markowitz et al.</span><span>2013</span></a></cite>
          or chunks <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib87"><span>87</span><span>Takahasi et
                  al.</span><span>2010</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib39"><span>39</span><span>Kakishita et
                  al.</span><span>2008</span></a></cite></span>. TweetyNet could annotate these
          explicitly defined higher level structures, that would then be passed to downstream
          unsupervised models designed for tasks like similarity measurement (e.g. <cite
            itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib26"><span>26</span><span>Goffinet et al.</span><span>2021</span></a></cite>;
          <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib75"><span>75</span><span>Sainburg et al.</span><span>2019</span></a></cite>,
          <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib76"><span>76</span><span>Sainburg et
                al.</span><span>2020</span></a></cite>). A second way that supervised and
          unsupervised algorithms could be combined would be to reverse the order, and use the
          unsupervised model as a front end. For example, models like those of <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib76"><span>76</span><span>Sainburg
                et al.</span><span>2020</span></a></cite> could be used to automatically generate a
          candidate set of syllable classes from a relatively small dataset of cleanly segmented
          song. A researcher would visually inspect and validate these candidate classes, and once
          validated, use them with TweetyNet to bootstrap annotation of a much larger dataset.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Lastly, we speculate on use of
          trained TweetyNet models to measure uncertainty and similarity. These measures can be
          estimated using either the probabilities that TweetyNet produces as outputs, or with
          so-called ‘activations’ within layers of the network that are elicited by feeding inputs
          through it. The output probabilities can serve as a metric in syllable space. For example,
          when predicting new annotations, researchers could use output probabilities from TweetyNet
          to flag less confident predictions for subsequent human inspection and post-processing. As
          shown in <a href="#fig9" itemscope="" itemtype="http://schema.stenci.la/Link">Figure
            9D</a>, this approach can highlight rare song variants and may also help annotators
          identify edge cases where they have defined syllable classes that are too similar to each
          other. More generally, a researcher could use a TweetyNet model trained on a single
          adult’s song to obtain an estimate of any other song’s similarity to it, such as the
          adult’s offspring or even recordings of the same adult’s song as a juvenile. This could be
          done using the output probabilities, or activations within the network. Activations in
          trained TweetyNet models could also be used to assess the output of unsupervised models
          that generate vocalizations <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib75"><span>75</span><span>Sainburg et al.</span><span>2019</span></a></cite>,
          analogous to similar approaches in computer vision <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib77"><span>77</span><span>Salimans
                  et al.</span><span>2016</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib35"><span>35</span><span>Heusel
                  et al.</span><span>2017</span></a></cite></span>.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="conclusion">Conclusion</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The family of songbirds that
          learns by imitation consists of over 4500 species. Some of these singers, such as the
          canary, produce songs that are much too complex to be automatically annotated with
          existing methods, and for these complex singers little is known about the syntax structure
          and organization of song. The results we present suggest that our approach makes automated
          syllable-level annotation for many of these species possible. By sharing trained models,
          tutorials, data, and a library for benchmarking models, we also establish a strong
          baseline for work building upon ours. We are confident our method enables songbird
          researchers to automate annotation of very large datasets of entire bouts of song,
          required for analyses that address central questions of sensorimotor learning.</p>
        <h2 itemscope="" itemtype="http://schema.stenci.la/Heading" id="materials-and-methods">
          Materials and methods</h2>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="data-preparation">Data
          preparation</h3>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="segmenting-audio-files-into-syllables">Segmenting audio files into syllables</h4>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading" id="algorithm">Algorithm</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">For Bengalese finch song, we
          applied a widely-used simple algorithm to segment audio into syllables, as described in
          the Introduction and shown in <a href="#fig1" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 1A</a>. The first step of this algorithm
          consists of finding all periods (colored line segments, middle and bottom axes of <a
            href="#fig1" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 1A</a>) where
          the amplitude of song stays above some threshold (dashed horizontal line on bottom axes of
          <a href="#fig1" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 1A</a>). The
          resulting segments are further cleaned up using two more parameters. Any periods between
          segments that are shorter than a minimum silent interval are removed, merging any
          syllables neighboring those intervals, and then finally any remaining segments shorter
          than a minimum syllable duration are removed. We used the implementation of this audio
          segmenting algorithm in the evfuncs tool <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a
              href="#bib62"><span>62</span><span>Nicholson</span><span>2021</span></a></cite>, that
          correctly replicates segmentation of the <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib58"><span>58</span><span>Nicholson
                et al.</span><span>2017</span></a></cite> dataset, which was segmented using Matlab
          code developed for previous papers (see for example <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib92"><span>92</span><span>Tumer and
                Brainard</span><span>2007</span></a></cite>).</p>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="estimating-segmenting-parameters-for-canary-song">Estimating segmenting parameters for
          canary song</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">As we state in the
          Introduction, the same algorithm cannot be applied to canary song. In spite of this, we
          apply the algorithm to canary song for results in <a href="#fig3" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 3</a>, to make very clear the issues that
          would result from relying on it. To estimate parameters that would produce the least
          amount of errors when segmenting canary song with this algorithm, we wrote a script that
          found the following for each bird’s song: (1) the median amplitude at all syllable onsets
          and offsets in the ground truth data with segmentation adjusted by human annotators, (2)
          the 10th percentile of syllable durations, (3) and the 0.1th percentile of silent
          intervals between syllables. We visually inspected the distributions of these values
          extracted from all segments, with our estimated segmenting parameters superimposed, to
          validate that we would not artificially create a very large number of errors by using the
          parameters we found with this script.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="annotation-of-bengalese-finch-song">Annotation of Bengalese finch song</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Experiments in <a href="#fig4"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 4</a> included song from
          four birds in the ‘BirdsongRecognition’ dataset <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a
              href="#bib44"><span>44</span><span>Koumura</span><span>2016</span></a></cite>. The
          models in the original study were designed to annotate specific sequences within song, as
          described in their methods. The goal of our model is to annotate entire bouts of song. To use
          that dataset in our experiments, we needed to fully annotate all bouts of song. If we did
          not label all syllables, then our model would be unfairly penalized when it correctly
          annotated syllables that were present in the original dataset, but were not annotated. Two
          of the authors (Cohen and Nicholson) fully annotated the song from four of the birds,
          employing the same GUI application used to annotate canary song. The vast majority of
          syllables that we labeled were the low-frequency, high-entropy ‘introduction’ notes that
          occur at the beginning of some song bouts in varying numbers, that are often ignored
          during analysis of zebra finch and Bengalese finch song. For the handful of cases where
          other syllables were not labeled, we chose from among the classes present in the
          already-annotated data to assign labels to these. In some very rare cases, we found
          syllables where the category was not clear, similar to the cases we describe for canary
          song in <a href="#fig9" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 9</a>.
          We chose to assign a separate class to these and remove song bouts containing these
          classes from both the training and test sets. As we acknowledge in <a href="#fig9"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 9</a> Discussion, an
          inability to handle edge cases like these is a limitation of any standard supervised
          learning algorithm like ours, that operates at the level of syllables. We removed these
          cases so that we could be sure that benchmarking results accurately reflected how well the
          model performed on well-classified syllables.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="annotation-of-canary-song">
          Annotation of canary song</h3>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="bootstrapping-annotation-with-tweetynet">Bootstrapping annotation with TweetyNet</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In this manuscript, we used
          annotated domestic canary datasets an order of magnitude larger than previously published.
          To create these datasets we used TweetyNet followed by manual proofreading of its results.
          This process, described below, allowed ‘bootstrapping’ TweetyNet’s performance. Song
          syllables were segmented and annotated in a semi-automatic process:</p>
        <ul itemscope="" itemtype="http://schema.org/ItemList">
          <li itemscope="" itemtype="http://schema.org/ListItem" itemprop="itemListElement">
            <meta itemprop="position" content="1">
            <meta itemprop="url" content="#1">A set of 100 songs was manually segmented and
            annotated using a <a
              href="https://github.com/yardencsGitHub/BirdSongBout/tree/master/helpers/GUI"
              itemscope="" itemtype="http://schema.stenci.la/Link">GUI</a> developed in-house <cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib18"><span>18</span><span>Cohen</span><span>2022</span></a></cite>. This
            set was chosen to include all potential syllable types as well as cage noises.
          </li>
          <li itemscope="" itemtype="http://schema.org/ListItem" itemprop="itemListElement">
            <meta itemprop="position" content="2">
            <meta itemprop="url" content="#2">The manually labeled set was used to train <a
              href="https://github.com/yardencsGitHub/tweetynet" itemscope=""
              itemtype="http://schema.stenci.la/Link">TweetyNet</a> <cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib64"><span>64</span><span>Nicholson</span><span>2022</span></a></cite>.
          </li>
          <li itemscope="" itemtype="http://schema.org/ListItem" itemprop="itemListElement">
            <meta itemprop="position" content="3">
            <meta itemprop="url" content="#3">In both the training phase of TweetyNet and the
            prediction phase for new annotations, data is fed to TweetyNet in segments of 1 second
            and TweetyNet&#39;s output is the most likely label for each 2.7 ms time bin in the
            recording.
          </li>
          <li itemscope="" itemtype="http://schema.org/ListItem" itemprop="itemListElement">
            <meta itemprop="position" content="4">
            <meta itemprop="url" content="#4">The trained algorithm annotated the rest of the data
            and its results were manually verified and corrected.
          </li>
        </ul>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="assuring-the-identity-and-separation-of-syllable-classes">Assuring the identity and
          separation of syllable classes</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The manual steps in the
          pipeline described above can still miss rare syllable types or mislabel syllables into the
          wrong classes because of the human annotator’s mistake or because some annotators are more
          likely to lump or split syllable classes. To address this potential variability in
          canaries, where each bird can have as many as 50 different syllables, we made sure two
          annotators agree on the definition of the syllable classes. Then, to make sure that the
          syllable classes are well separated, all the spectrograms of every instance of every
          syllable, as segmented in the previous section, were zero-padded to the same duration. An
          outlier detection algorithm (<a
            href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html"
            itemscope="" itemtype="http://schema.stenci.la/Link">IsolationForest</a>) was used to
          flag and re-check potential mislabeled syllables or previously unidentified syllable
          classes.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="segmenting-annotated-phrases-of-waterslager-canaries">Segmenting annotated phrases of
          Waterslager canaries</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In <a href="#fig1s2"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 1—figure supplement 2</a> we
          include data from waterslager canaries, available from a previous project in the Gardner
          lab <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib50"><span>50</span><span>Markowitz et
                al.</span><span>2013</span></a></cite>. To include this data, we needed to break
          annotated phrase segments into syllable segments. Songs were previously segmented into
           phrases, trilled repetitions of syllables, and not into individual syllables. In each
          segmented phrase, we separated vocalization and noise fluctuations between vocalizations
          by fitting a two-state hidden Markov model with Gaussian emission functions to the
          acoustic signal. Putative syllable segments produced by this procedure were proofread and
          manually corrected using a <a
            href="https://github.com/yardencsGitHub/BirdSongBout/tree/master/helpers/GUI"
            itemscope="" itemtype="http://schema.stenci.la/Link">GUI</a> developed in-house.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="generating-spectrograms">
          Generating spectrograms</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Spectrograms were generated
          from audio files using custom Numpy (Bengalese finch) or Matlab (canary) code. For
          Bengalese finches, the code we used to generate spectrograms is built into the <a
            href="https://github.com/NickleDave/vak/blob/main/src/vak/spect.py" itemscope=""
            itemtype="http://schema.stenci.la/Link">vak library</a>. For canaries, the code we used
          to generate spectrograms can be found <a
            href="https://github.com/jmarkow/zftftb/blob/master/zftftb_pretty_sonogram.m"
            itemscope="" itemtype="http://schema.stenci.la/Link">here</a> <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a
              href="#bib52"><span>52</span><span>Markowitz</span><span>2022</span></a></cite>.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">All spectrograms for song from
          a given species were created with the same parameters, such as the number of samples in
          the window for the Fast Fourier Transform (NFFT). For Bengalese finch song, we used <span
            itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="\text{NFFT}=512"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-mtext"><span
                      class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.372em; padding-bottom: 0.372em;">NFFT</span></span><span
                    class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                    class="mjx-mn MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.372em; padding-bottom: 0.372em;">512</span></span></span></span></span></span>
          with a step size of 64. For canaries we used <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="\text{NFFT}=1024"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mtext"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.372em; padding-bottom: 0.372em;">NFFT</span></span><span
                    class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                    class="mjx-mn MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.372em; padding-bottom: 0.372em;">1024</span></span></span></span></span></span>
          with a step size of 119. This produced spectrograms with a time bin size of 1ms for
          Bengalese finches, and 2.7ms for canaries.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We chose spectrogram parameters
          such that the duration of a time bin was just smaller than the shortest duration silent
          gaps between syllables. A larger bin size would have prevented our model from producing
          correct segments, whenever one of the true silent gaps was shorter than our bin size. In
          initial studies we experimented with even smaller bin sizes, but found that the network
          tended to over-segment. Future work could compensate for this oversegmenting by modifying
          the loss function as we point out in the Discussion.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="neural-network-architecture-1">Neural network architecture</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Here, we provide a detailed
          description of the TweetyNet neural network architecture. The architecture that we develop
          is most directly related to those that have been used for event detection in audio and
          video <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib10"><span>10</span><span>Böck and
                  Schedl</span><span>2012</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib68"><span>68</span><span>Parascandolo et
                  al.</span><span>2016</span></a></cite></span> and for phoneme classification and
          sequence labeling <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib29"><span>29</span><span>Graves and
                  Schmidhuber</span><span>2005</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib31"><span>31</span><span>Graves
                  and Graves</span><span>2012</span></a></cite></span>. It is also somewhat similar
          to deep network models for speech recognition, but a crucial difference is that
          state-of-the-art models in that area map directly from sequences of acoustic features to
          sequences of words <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib30"><span>30</span><span>Graves et al.</span><span>2006</span></a></cite>.
          The success of these state-of-the-art models is attributed to the fact that they learn
          this mapping from speech to text, <em itemscope=""
            itemtype="http://schema.stenci.la/Emphasis">avoiding</em> the intermediate step of
          classifying each frame of audio, as has previously been shown <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib31"><span>31</span><span>Graves and
                Graves</span><span>2012</span></a></cite>. In other words, they avoid the problem of
          classifying every frame that we set out to solve.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">As described in the
          introduction, the network takes as input batches of windows from a spectrogram (c.f. top
          of <a href="#fig2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 2</a>) and
          produces as output a tensor of size (<span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="m"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">m</span></span></span></span></span></span>
          batches <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="\times c"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">×</span></span><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">c</span></span></span></span></span></span>
          classes <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="\times t"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">×</span></span><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span></span></span></span></span>
          time bins). In <a href="#fig10" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 10</a> we show how network blocks in
          that figure above relate to the shapes of tensors, and how those shapes change as the
          network maps input to output. We refer to this as we give specific parameters here. Note
          that we leave out the batch dimension in this description.</p>
        <figure itemscope="" itemtype="http://schema.stenci.la/Figure" id="fig10"
          title="Figure 10."><label data-itemprop="label">Figure 10.</label><img
            src="index.html.media/elife-63853-fig10-v2.jpg" alt="" itemscope=""
            itemtype="http://schema.org/ImageObject">
          <figcaption>
            <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
              id="tweetynet-architecture-and-tensor-shapes-resulting-from-each-operation-in-the-network">
              TweetyNet architecture and tensor shapes resulting from each operation in the network.
            </h4>
          </figcaption>
        </figure>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="convolutional-blocks">
          Convolutional blocks</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The spectrogram window passes
          through two standard convolutional blocks, each of which consists of a convolutional layer
          and a max pooling layer. We use the standard term ‘block’ to refer to a layer that
          performs several operations on its inputs. For a convolutional block, the first operation
          is the convolution operation itself (‘2D conv.’ in <a href="#fig2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figures 2</a> and <a href="#fig10" itemscope=""
            itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">10</span></a>), as it is defined in the deep
          learning literature. This cross-correlation-like operation (asterisk in <a href="#fig2"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figures 2</a> and <a href="#fig10"
            itemscope="" itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">10</span></a>) between the spectrogram window
          and the learned filters (greyscale boxes in <a href="#fig2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figures 2</a> and <a href="#fig10" itemscope=""
            itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">10</span></a>) produces a set of feature
          maps. In both convolutional blocks, we use filters of size (5 × 5), with a stride of 1.
          The first block contains 32 filters and the second contains 64, as shown in <a
            href="#fig10" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 10</a>. We pad
          the feature maps so that they are the same size as the input. For a spectrogram window of
          size (176 time bins × 513 frequency bins), as we use for Bengalese finch song, this would
          produce an output of (176 × 513 × 32) from the first convolution, similar to what is shown
          in <a href="#fig10" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 10</a>.
        </p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">A key advantage of
          convolutional layers is that they enable ‘weight sharing’, that is, the relatively small
          number of parameters in each filter is applied to the input via the convolution operation,
          instead of needing to have weights for each dimension of the input (e.g. each pixel in an
          image). We used a full weight sharing scheme, meaning that each filter was
          cross-correlated with the entire input window. Previous work has tested whether
          performance on tasks related to ours, such as speech recognition, might be improved by
          alternate weight-sharing schemes, for instance by restricting filters to specific
          frequency ranges within a spectrogram. One previous study did report a benefit for this
          limited weight-sharing scheme applied to automatic speech recognition <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib1"><span>1</span><span>Abdel-Hamid
                et al.</span><span>2014</span></a></cite>. However, this approach has not been
          widely adopted, and to the best of our knowledge, the common ‘full’ weight-sharing
          approach that we employ here is used by all state-of-the-art models for speech recognition
          for example, <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib5"><span>5</span><span>Amodei et al.</span><span>2016</span></a></cite>, as
          well as the event detection models that we based our architecture on <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib68"><span>68</span><span>Parascandolo et
                  al.</span><span>2016</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib47"><span>47</span><span>Lea et
                  al.</span><span>2017</span></a></cite></span>.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="max-pooling-layer">Max
          pooling layer</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In both convolutional blocks,
          we followed the convolutional layer by a max pooling layer. The max pooling operation
          (‘Pooling’ in <a href="#fig2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure
            2</a>) downsamples feature maps by sliding a window across the input (orange bin in <a
            href="#fig2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 2</a>) and at
          each point keeping only the element with the maximum value within the window. We used a
          window size of (8 × 1) and a stride of (8, 1), with windows oriented so that the side of
          size one and the stride of size one were in the temporal dimension. Both the shape and
          stride were meant to avoid downsampling in the temporal dimension, under the assumption
          that it was important to retain this information. Applying a max pooling operation with
          these parameters to the first set of convolutional maps with size (176 × 513 × 32)
          produces an output tensor with size (176 × 64 × 32) as shown in <a
            href="#fig10" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 10</a>.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The max pooling operation is
          widely used in networks for related tasks like automatic speech recognition and audio
          event detection. Previous work has not found any benefit to alternative pooling operations
          such as stochastic pooling <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib73"><span>73</span><span>Sainath et al.</span><span>2013</span></a></cite>
          and alternatives have not been widely adopted. To our knowledge most related work also
          adopts our approach of not down-sampling in the temporal dimension, and studies have not
          found any advantage when using larger strides in the temporal dimension <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib73"><span>73</span><span>Sainath et
                al.</span><span>2013</span></a></cite>.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="recurrent-layer">Recurrent
          layer</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The output of the second
          convolutional block passes through a recurrent layer made up of LSTM units. Before passing
          it into the recurrent layer, we stack the feature maps: e.g. an output of (176 time bins x
          eight down-sampled frequency bins x 64 feature maps) becomes (176 time bins x 512
          features) (indicated as &quot;Flatten&quot; in <a href="#fig10" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 10</a>). We specifically use a
          bidirectional LSTM, meaning the recurrent network processes the input in both the forward
          and backward direction. By default we set the size of the hidden state in the network
          equal to the ‘features’ dimension, and based on experiments in <a href="#fig5"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5</a> this appears to be a
          reasonable default. The matrix of hidden states for all time steps become the output that
          we feed into the final layer. We adopt the standard practice of concatenating the hidden
          states from running the sequence in the forward and backward directions, doubling its
          size.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="linear-layer">Linear layer
        </h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The final layer in TweetyNet is
          a linear projection (<span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="{\overrightarrow{W}}_{t,s}"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-msubsup"><span class="mjx-base"><span
                        class="mjx-texatom"><span class="mjx-mrow"><span
                            class="mjx-munderover"><span class="mjx-stack"><span class="mjx-over"
                                style="padding-bottom: 0.06em; padding-left: 0.093em;"><span
                                  class="mjx-mo"><span class="mjx-delim-h"><span
                                      class="mjx-char MJXc-TeX-main-R"
                                      style="padding-top: 0.222em; padding-bottom: 0.35em; margin: 0px -0.309em 0px -0.084em;"></span><span
                                      class="mjx-char MJXc-TeX-main-R"
                                      style="padding-top: 0.222em; padding-bottom: 0.35em; margin: 0px -0.056em 0px -0.281em;"></span></span></span></span><span
                                class="mjx-op"><span class="mjx-mi"><span
                                    class="mjx-char MJXc-TeX-math-I"
                                    style="padding-top: 0.446em; padding-bottom: 0.298em; padding-right: 0.104em;">W</span></span></span></span></span></span></span></span><span
                      class="mjx-sub"
                      style="font-size: 70.7%; vertical-align: -0.212em; padding-right: 0.071em;"><span
                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span></span></span></span></span></span></span></span></span>,
          purple matrix in <a href="#fig2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 2</a>) of the recurrent layer’s output
          onto the different syllable classes, <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="{\displaystyle s=\mathrm{1...}n}"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-texatom"><span
                      class="mjx-mrow"><span class="mjx-mstyle"><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span><span
                            class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                            class="mjx-texatom MJXc-space3"><span class="mjx-mrow"><span
                                class="mjx-mn"><span class="mjx-char MJXc-TeX-main-R"
                                  style="padding-top: 0.372em; padding-bottom: 0.372em;">1...</span></span></span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">n</span></span></span></span></span></span></span></span></span></span>,
          resulting in a vector of <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="n"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">n</span></span></span></span></span></span>
          syllable-similarity scores for each spectrogram time bin <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="t"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span></span></span></span></span>.
          The number of classes, <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="n"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">n</span></span></span></span></span></span>,
          is predetermined by the user. To segment syllables, the bin-wise syllable-similarity
          scores are first used to select a single syllable class per time bin by choosing the label
          with the highest syllable-similarity score. Since similarity scores can be normalized,
          this is akin to maximum a posteriori (MAP) label selection. Then, the labeled time bins
          are used to separate continuous song segments from no-song segments and to annotate each
          song-segment with a single label using majority decision across time bins in that segment.
        </p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="training-and-benchmarking">
          Training and benchmarking</h3>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="input-data-transformations">
          Input data transformations</h4>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading" id="windows">Windows</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">As stated above, the input to
          the network consists of spectrogram windows. We used a window size of 176 time bins for
          Bengalese finch song and 370 time bins for canary song, with the exception of experiments
          testing the impact of this hyperparameter in <a href="#fig5" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 5</a>.</p>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="vectors-of-labeled-time-bins">Vectors of labeled time bins</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We formulate annotation as a
          supervised learning problem where each spectrogram window <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="x"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span></span></span></span></span>
          has a corresponding vector of labeled time bins <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="y"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.519em; padding-right: 0.006em;">y</span></span></span></span></span></span>,
          and our goal is to train the network <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="f"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.519em; padding-bottom: 0.519em; padding-right: 0.06em;">f</span></span></span></span></span></span>
          to correctly map a window to this vector of labeled time bins, <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="f(x)⟶y"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.519em; padding-bottom: 0.519em; padding-right: 0.06em;">f</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span><span
                    class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.225em; padding-bottom: 0.372em;"></span></span><span
                    class="mjx-mi MJXc-space3"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.519em; padding-right: 0.006em;">y</span></span></span></span></span></span>.
          These vectors are generated dynamically by our software from annotations consisting of
          segment labels and their onset and offset times. Each element in the vector <span
            itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="y"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.519em; padding-right: 0.006em;">y</span></span></span></span></span></span>
          contains an integer label <em itemscope=""
            itemtype="http://schema.stenci.la/Emphasis">c</em><sub itemscope=""
            itemtype="http://schema.stenci.la/Subscript">t</sub> corresponding to the syllable class
          <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="c"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">c</span></span></span></span></span></span>
          in that time bin of the spectrogram window <em itemscope=""
            itemtype="http://schema.stenci.la/Emphasis">x</em><sub itemscope=""
            itemtype="http://schema.stenci.la/Subscript">t</sub>. To this set of class labels, we
          add an additional class for the ‘background’ time bins that result naturally from gaps
          between annotated segments (‘Bg.’ in <a href="#fig2" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 2</a>). This ‘background’ class will
          include brief quiet periods between syllables, as well as any other longer periods left
          unlabeled by the annotator that may contain non-song bird calls and background noises.</p>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="batches-of-window-labeled-time-bin-vector-pairs">Batches of (window, labeled time bin
          vector) pairs</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">During training, we randomly
          grab a batch of (window, labeled time bin vector) pairs <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="(x,y)"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                    class="mjx-mi MJXc-space1"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.519em; padding-right: 0.006em;">y</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span></span></span></span></span>
          from all possible windows in the dataset X. To achieve this, we developed a torch Dataset
          class that represents all such possible windows paired with the corresponding vector of
          labeled timebins <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="({x}_{i},{y}_{i})"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-mo"><span
                      class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                    class="mjx-msubsup"><span class="mjx-base"><span class="mjx-texatom"><span
                          class="mjx-mrow"><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span></span></span></span><span
                      class="mjx-sub"
                      style="font-size: 70.7%; vertical-align: -0.212em; padding-right: 0.071em;"><span
                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em;">i</span></span></span></span></span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                    class="mjx-msubsup MJXc-space1"><span class="mjx-base"
                      style="margin-right: -0.006em;"><span class="mjx-texatom"><span
                          class="mjx-mrow"><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.519em; padding-right: 0.006em;">y</span></span></span></span></span><span
                      class="mjx-sub"
                      style="font-size: 70.7%; vertical-align: -0.375em; padding-right: 0.071em;"><span
                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em;">i</span></span></span></span></span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span></span></span></span></span>.
          The class tracks which windows the network has already seen during any epoch (iteration
          through the entire data set), ensuring that we avoid repeating the same windows during
          training, which could encourage the network to memorize the training data.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The choice to randomly present
          windows also acts as a form of data augmentation that encourages the network to exhibit
          translation invariance. That is, because the network sees very similar sequences
          repeatedly, but those sequences are randomly shifted forward or backward slightly in time,
          it learns to correctly classify all time bins in a window regardless of how the window is
          presented to the model.</p>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading" id="normalization">Normalization
        </h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Normalization is a standard
          practice that improves optimization of machine learning models, but is not always
          necessary for neural networks.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">For Bengalese finch song, we
          normalized spectrograms; more precisely we standardized by finding the mean μ and standard
          deviation <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="\sigma"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em; padding-right: 0.001em;">σ</span></span></span></span></span></span>
          of every frequency bin across all spectrograms in the training set, and then for every
          window <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="x"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span></span></span></span></span>
          we subtracted off the mean and divided by the standard deviation: <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="{x}_{\text{normalized}}=\frac{x-\mu }{\sigma }"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-msubsup"><span
                      class="mjx-base"><span class="mjx-texatom"><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span></span></span></span><span
                      class="mjx-sub"
                      style="font-size: 70.7%; vertical-align: -0.219em; padding-right: 0.071em;"><span
                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                            class="mjx-mtext"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.372em;">normalized</span></span></span></span></span></span><span
                    class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                    class="mjx-mfrac MJXc-space3"><span class="mjx-box MJXc-stacked"
                      style="width: 1.522em; padding: 0px 0.12em;"><span class="mjx-numerator"
                        style="font-size: 70.7%; width: 2.153em; top: -1.505em;"><span
                          class="mjx-mrow" style=""><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.298em; padding-bottom: 0.446em;"></span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.519em;">μ</span></span></span></span><span
                        class="mjx-denominator"
                        style="font-size: 70.7%; width: 2.153em; bottom: -0.524em;"><span
                          class="mjx-mi" style=""><span class="mjx-char MJXc-TeX-math-I"
                            style="padding-top: 0.225em; padding-bottom: 0.298em; padding-right: 0.001em;">σ</span></span></span><span
                        style="border-bottom: 1.3px solid; top: -0.296em; width: 1.522em;"
                        class="mjx-line"></span></span><span
                      style="height: 1.435em; vertical-align: -0.37em;"
                      class="mjx-vsize"></span></span></span></span></span></span>. Note that we
          achieved this with a SpectrogramScaler class built into the vak library rather than
          pre-processing with a script. For canary song we did not apply this normalization, and
          left the spectrograms as processed by the Matlab code referenced above. We did not
          systematically assess how normalization impacted performance.</p>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading" id="spectrogram-thresholding">
          Spectrogram thresholding</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We did not apply any
          thresholding to spectrograms, as is often done when visualizing them to increase contrast
          between sounds of interest and often quieter background noise. In preliminary experiments,
          we did test the effect of thresholding spectrograms, setting any value of the power
          spectrum less than the specified threshold to zero. However, we found that this led to a
          slight increase in error rates, and also made training more unstable. Our best guess for
          this effect of thresholding is that it produces abrupt, large magnitude changes in values
          in the spectrogram that may affect the gradient computed during training.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="metrics">Metrics</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We define the metrics we use
          before describing our training methods, since our methods depend on these metrics. We
          measured performance of TweetyNet with two metrics.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="frame-error">Frame error
        </h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The first is the frame error,
          which simply measures for each acoustic frame (in our case, each time bin in a spectrogram)
          whether the predicted label matches the ground truth label. Hence the range of the frame
          error is between 0 and 1, that is, it can be stated as a percent, and gives an intuitive
          measure of a model’s overall performance. Previous work on supervised sequence labeling,
          including bidirectional-LSTM architectures similar to ours, has used this metric <span
            itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib31"><span>31</span><span>Graves</span><span>2012</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib29"><span>29</span><span>Graves
                  and Schmidhuber</span><span>2005</span></a></cite></span>.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="syllable-error-rate">
          Syllable error rate</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The second metric we used is
          commonly called the word error rate in the speech recognition literature, and here we call
          it the syllable error rate. Because the syllable error rate is key to our results, we
          define it here, as shown in <a href="#equ1" itemscope=""
            itemtype="http://schema.stenci.la/Link">Equation 1</a>.</p><span itemscope=""
          itemtype="http://schema.stenci.la/MathBlock"><span class="mjx-chtml MJXc-display"
            style="text-align: center;"><span class="mjx-math"
              aria-label="\begin{array}{ll}{\displaystyle \text{Syllable Error rate}}&amp; =\frac{\text{Edit distance(reference sequence, predicted)}}{\text{Length(reference sequence)}}\\ {\displaystyle }&amp; =\frac{\text{Substitutions + Insertions + Deletions}}{\text{Length(reference sequence)}}\end{array}"><span
                class="mjx-mrow" aria-hidden="true"><span class="mjx-mtable"
                  style="vertical-align: -1.788em; padding: 0px 0.167em;"><span
                    class="mjx-table"><span class="mjx-mtr" style="height: 2.113em;"><span
                        class="mjx-mtd"
                        style="padding: 0px 0.5em 0px 0px; text-align: left; width: 8em;"><span
                          class="mjx-mrow" style="margin-top: 0.207em;"><span
                            class="mjx-texatom"><span class="mjx-mrow"><span
                                class="mjx-mstyle"><span class="mjx-mrow"><span
                                    class="mjx-mtext"><span class="mjx-char MJXc-TeX-main-R"
                                      style="padding-top: 0.446em; padding-bottom: 0.519em;">Syllable
                                      Error rate</span></span></span></span></span></span><span
                            class="mjx-strut"></span></span></span><span class="mjx-mtd"
                        style="padding: 0px 0px 0px 0.5em; text-align: left; width: 14.876em;"><span
                          class="mjx-mrow"><span class="mjx-mo"><span
                              class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                            class="mjx-mfrac MJXc-space3"><span class="mjx-box MJXc-stacked"
                              style="width: 13.581em; padding: 0px 0.12em;"><span
                                class="mjx-numerator"
                                style="font-size: 70.7%; width: 19.206em; top: -1.706em;"><span
                                  class="mjx-mtext" style=""><span class="mjx-char MJXc-TeX-main-R"
                                    style="padding-top: 0.446em; padding-bottom: 0.593em;">Edit
                                    distance(reference sequence,
                                    predicted)</span></span></span><span class="mjx-denominator"
                                style="font-size: 70.7%; width: 19.206em; bottom: -0.999em;"><span
                                  class="mjx-mtext" style=""><span class="mjx-char MJXc-TeX-main-R"
                                    style="padding-top: 0.446em; padding-bottom: 0.593em;">Length(reference
                                    sequence)</span></span></span><span
                                style="border-bottom: 1.3px solid; top: -0.296em; width: 13.581em;"
                                class="mjx-line"></span></span><span
                              style="height: 1.913em; vertical-align: -0.707em;"
                              class="mjx-vsize"></span></span><span
                            class="mjx-strut"></span></span></span></span><span class="mjx-mtr"
                      style="height: 1.963em;"><span class="mjx-mtd"
                        style="padding: 0.2em 0.5em 0px 0px; text-align: left;"><span
                          class="mjx-mrow" style="margin-top: 0.056em;"><span
                            class="mjx-texatom"><span class="mjx-mrow"><span
                                class="mjx-mstyle"><span
                                  class="mjx-mrow"></span></span></span></span><span
                            class="mjx-strut"></span></span></span><span class="mjx-mtd"
                        style="padding: 0.2em 0px 0px 0.5em; text-align: left;"><span
                          class="mjx-mrow"><span class="mjx-mo"><span
                              class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                            class="mjx-mfrac MJXc-space3"><span class="mjx-box MJXc-stacked"
                              style="width: 11.92em; padding: 0px 0.12em;"><span
                                class="mjx-numerator"
                                style="font-size: 70.7%; width: 16.858em; top: -1.493em;"><span
                                  class="mjx-mtext" style=""><span class="mjx-char MJXc-TeX-main-R"
                                    style="padding-top: 0.446em; padding-bottom: 0.446em;">Substitutions
                                    + Insertions + Deletions</span></span></span><span
                                class="mjx-denominator"
                                style="font-size: 70.7%; width: 16.858em; bottom: -0.999em;"><span
                                  class="mjx-mtext" style=""><span class="mjx-char MJXc-TeX-main-R"
                                    style="padding-top: 0.446em; padding-bottom: 0.593em;">Length(reference
                                    sequence)</span></span></span><span
                                style="border-bottom: 1.3px solid; top: -0.296em; width: 11.92em;"
                                class="mjx-line"></span></span><span
                              style="height: 1.763em; vertical-align: -0.707em;"
                              class="mjx-vsize"></span></span><span
                            class="mjx-strut"></span></span></span></span></span></span></span></span></span></span>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">This metric is an edit
          distance, that counts the number of edits (insertions, deletions, and substitutions)
          needed to correct a predicted sequence so it matches the ground-truth (‘reference’)
          sequence. A common algorithm used to compute the number of edits is the Levenshtein
          distance that we use here. The edit distance is normalized by the length of the ground
          truth sequence, to make it possible to compare between sequences of different lengths.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="training">Training</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We trained all models using the
          Adam optimizer <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib41"><span>41</span><span>Kingma and Ba</span><span>2014</span></a></cite>
          with a learning rate of 0.001, and other hyperparameters set to the defaults in the torch
          library: <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math"
                aria-label="{\displaystyle ({\beta }_{1},{\beta }_{2}=(0.9,0.999),ϵ=1e-08,\text{weight decay}=0.0,\text{amsgrad}=\mathtt{\text{False}})}"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-texatom"><span
                      class="mjx-mrow"><span class="mjx-mstyle"><span class="mjx-mrow"><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                            class="mjx-msubsup"><span class="mjx-base"
                              style="margin-right: -0.007em;"><span class="mjx-texatom"><span
                                  class="mjx-mrow"><span class="mjx-mi"><span
                                      class="mjx-char MJXc-TeX-math-I"
                                      style="padding-top: 0.519em; padding-bottom: 0.446em; padding-right: 0.007em;">β</span></span></span></span></span><span
                              class="mjx-sub"
                              style="font-size: 70.7%; vertical-align: -0.36em; padding-right: 0.071em;"><span
                                class="mjx-texatom" style=""><span class="mjx-mrow"><span
                                    class="mjx-mn"><span class="mjx-char MJXc-TeX-main-R"
                                      style="padding-top: 0.372em; padding-bottom: 0.372em;">1</span></span></span></span></span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-msubsup MJXc-space1"><span class="mjx-base"
                              style="margin-right: -0.007em;"><span class="mjx-texatom"><span
                                  class="mjx-mrow"><span class="mjx-mi"><span
                                      class="mjx-char MJXc-TeX-math-I"
                                      style="padding-top: 0.519em; padding-bottom: 0.446em; padding-right: 0.007em;">β</span></span></span></span></span><span
                              class="mjx-sub"
                              style="font-size: 70.7%; vertical-align: -0.36em; padding-right: 0.071em;"><span
                                class="mjx-texatom" style=""><span class="mjx-mrow"><span
                                    class="mjx-mn"><span class="mjx-char MJXc-TeX-main-R"
                                      style="padding-top: 0.372em; padding-bottom: 0.372em;">2</span></span></span></span></span></span><span
                            class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                            class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                            class="mjx-mn"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.372em;">0.9</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-mn MJXc-space1"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.372em;">0.999</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-texatom MJXc-space1"><span class="mjx-mrow"><span
                                class="mjx-mo"><span class="mjx-char MJXc-TeX-math-I"
                                  style="padding-top: 0.225em; padding-bottom: 0.298em;">ϵ</span></span></span></span><span
                            class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                            class="mjx-mn MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.372em;">1</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">e</span></span><span
                            class="mjx-mo MJXc-space2"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.298em; padding-bottom: 0.446em;">−</span></span><span
                            class="mjx-mn MJXc-space2"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.372em;">08</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-mtext MJXc-space1"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.519em;">weight
                              decay</span></span><span class="mjx-mo MJXc-space3"><span
                              class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                            class="mjx-mn MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.372em;">0.0</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-mtext MJXc-space1"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.519em;">amsgrad</span></span><span
                            class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                            class="mjx-texatom MJXc-space3"><span class="mjx-mrow"><span
                                class="mjx-mtext"><span class="mjx-char MJXc-TeX-type-R"
                                  style="padding-top: 0.372em; padding-bottom: 0.298em;">False</span></span></span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span></span></span></span></span></span></span></span></span>.
        </p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">For all experiments, we used a
          batch size of 8. We specify a number of epochs in our configuration files (an epoch is one
          iteration through the entire training dataset) but in practice we found that the number of
          windows is so large that we did not complete one entire epoch of training before network
          performance on the validation set met the criteria for early stopping, as described in the
          next paragraph.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="early-stopping">Early
          stopping</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">To mitigate the tendency of
          neural networks to overfit, we employed early stopping. Error rates are measured on a
          validation set every val_step training steps, and training stops early if these error
          rates do not decrease after patience consecutive validation steps, where val_step and
          patience are option values declared by a user in configuration files for the vak library.
          We chose to specify validation in terms of a global step instead of epoch, because as just
          stated the size of the dataset of all possible windows is so large that training rarely
          completed an entire epoch. For each bird, the validation data set was kept separate from
          the training and test data sets. For Bengalese finches and canaries we used: <span
            itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math"
                aria-label="{\displaystyle (\mathtt{\text{val\_step}}=250,\mathtt{\text{patience}}=4,)}"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-texatom"><span
                      class="mjx-mrow"><span class="mjx-mstyle"><span class="mjx-mrow"><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                            class="mjx-texatom"><span class="mjx-mrow"><span class="mjx-mtext"><span
                                  class="mjx-char MJXc-TeX-type-R"
                                  style="padding-top: 0.519em; padding-bottom: 0.519em;">val_step</span></span></span></span><span
                            class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                            class="mjx-mn MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.372em;">250</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-texatom MJXc-space1"><span class="mjx-mrow"><span
                                class="mjx-mtext"><span class="mjx-char MJXc-TeX-type-R"
                                  style="padding-top: 0.372em; padding-bottom: 0.519em;">patience</span></span></span></span><span
                            class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                            class="mjx-mn MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.372em; padding-bottom: 0.372em;">4</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-mo MJXc-space1"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span></span></span></span></span></span></span></span></span>.
        </p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="learning-curves">Learning
          Curves</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">To estimate how much manually
          annotated training data is required to achieve a certain level of model performance, we
          generated learning curves that plot a metric such as frame error as a function of the
          amount of training data, as in the experiments shown in <a href="#fig3" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figures 3</a> and <a href="#fig4" itemscope=""
            itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">4</span></a>. These experiments followed
          standard methods for benchmarking supervised machine learning algorithms, following good
          practices <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib37"><span>37</span><span>James et al.</span><span>2013</span></a></cite>,
          such as training multiple replicates on separate subsets of the training data. Producing
          these learning curves where the dataset size is measured in duration required extra steps
          not needed for other tasks such as image classification. For each individual bird, we fit
          networks with training sets of increasing size (duration in seconds) and then measured
          performance on a separate, fixed test set. For each training replicate, audio files were
          drawn at random from a fixed-size total training set until the target size (e.g. 60 s) was
          reached. If the total duration of the randomly drawn audio files extended beyond the
          target, they were clipped at that target duration while ensuring that all syllable classes
          were still present in the training set. After training completed, we computed metrics such
          as frame error and syllable error rate on the held-out test set for each bird. As stated,
          we chose to use a totally separate fixed-size set, instead of e.g. using the remainder of
          the training data set, or generating multiple test sets in a <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="k"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.446em; padding-bottom: 0.298em;">k</span></span></span></span></span></span>-fold
          cross-validation scheme. We did this for two reasons: first, because computing metrics on
          relatively large test sets can be computationally expensive, and second, because we wanted
          to be sure that any variance in our measures across training replicates could be
          attributed to the randomly drawn training set, and not to changes in the test set.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In the case of Bengalese
          finches, we used training sets with durations {30, 45, 75, 120, 180, 480, 600} s, training
          10 replicates for each duration, with subsets drawn randomly from a total training set of
          900 seconds for each individual bird. The duration of the fixed test set for each bird was
          400 s. For canaries, we used training sets of durations {120, 180, 240, 300, 360, 420,
          480, 540, 600, 660} s, training seven replicates for each duration, with subsets drawn
          randomly from a total training set of 25,000 s for each bird. The duration of the fixed
          test set for each bird was 5000 s. For the point estimate of the model’s asymptotic
          syllable error rate on canary song, we used a training set of 6000 s and a test set of
          5000 s.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The method for generating
          learning curves as just described is built into the vak library and can be reproduced
          using the learncurve command in the terminal, along with the configuration files we
          shared.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="comparison-with-a-support-vector-machine-model">Comparison with a support vector
          machine model</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In <a href="#fig3" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 3</a>, we compare performance of
          TweetyNet with a Support Vector Machine (SVM) model. We trained the model on a set of
          audio features first described in <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib85"><span>85</span><span>Tachibana
                et al.</span><span>2014</span></a></cite>. Feature extraction code was translated to
          Python from original Matlab code kindly shared by the author. Based on previous work <cite
            itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib57"><span>57</span><span>Nicholson</span><span>2016</span></a></cite>, we
          used a Support Vector Machine with a radial basis function (RBF) kernel. To find good
          values for the kernel coefficient <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="\gamma"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.519em; padding-right: 0.025em;">γ</span></span></span></span></span></span>
          and the regularization parameter <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="C"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.519em; padding-bottom: 0.298em; padding-right: 0.045em;">C</span></span></span></span></span></span>,
          we performed halving random search across a range of values (<span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="\gamma"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.519em; padding-right: 0.025em;">γ</span></span></span></span></span></span>=(1e-9,
          1e-3), <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="C"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.519em; padding-bottom: 0.298em; padding-right: 0.045em;">C</span></span></span></span></span></span>=(60,1e10),
          log uniform distribution). In initial tests, we found that values of <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="\gamma"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.519em; padding-right: 0.025em;">γ</span></span></span></span></span></span>
          larger than 1.0 tended to produce pathological behavior where the model predicted one
          class for all features. We chose ranges for hyperparameter search that avoided this
          behavior. To carry out hyperparameter search we developed a pipeline in scikit-learn <span
            itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib70"><span>70</span><span>Pedregosa et
                  al.</span><span>2011</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib32"><span>32</span><span>Grisel
                  et al.</span><span>2020</span></a></cite></span>. In very rare cases, for two of
          the four birds, we needed to perform more than one run of the pipeline to find
          hyperparameters that did not cause this pathological behavior. To extract features and
          train models we adapted code from the hybrid-vocal-classifier library <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a
              href="#bib61"><span>61</span><span>Nicholson</span><span>2021</span></a></cite>, which
          provides a high-level interface to scikit-learn, and our pipeline including hyperparameter
          tuning was similarly built with scikit-learn code. All SVM models were trained on the
          exact same train-test splits used for training TweetyNet, by using dataset files generated
          by vak. This meant that for each training set duration there were 10 replicates trained
          for Bengalese finch song and seven replicates for canary song.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="statistics">Statistics</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">To compare syllable error
          rates, we used the Wilcoxon paired signed-rank test, a non-parametric alternative to the
          t-test. We computed the test once for each training set duration, using paired samples:
          the same number of training replicates, each trained on a randomly drawn subset of
          training data, where the ‘factor’ within each pair of replicates was the model used,
          TweetyNet or the SVM. To test for homogeneity of variance, we used Levene’s test.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="additional-analysis-of-model-performance">Additional analysis of model performance
        </h4>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="percentage-of-errors-near-boundaries">Percentage of errors near boundaries</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In ‘Simple post-processing
          greatly reduces syllable error rates’ we estimate the percentage of errors near
          boundaries. By ‘boundaries’ we mean the onset and offset times of syllables when they are
          manually annotated. A distribution of syllable durations computed from these onsets and
          offsets shows that boundaries are not static. There are two sources of this variation:
          naturally occurring motor variability in birdsong, and an additional noise component added
          by audio segmentation and human annotators. This variance in turn gives rise to frame
          errors, where the ground truth annotation and a trained TweetyNet model disagree about
          which of the time bins should be assigned the ‘background’ label. These frame errors very
          close to boundaries are likely to have a much smaller impact on the syllable error rate
          than frame errors in the middle of syllables, because near the boundary they have no
          effect on the sequence of labels produced by segmenting the model output, and only a minor
          effect on the estimated onset and offset times. In the main text and in <a href="#fig5s1"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 5—figure supplement 1</a> we
          estimate the percent of all such frame errors occurring at these noisy syllable onset and
          offset boundaries. To do so, we computed for every onset and offset the number of frame
          errors within a fixed distance of two time bins that specifically involved disagreement
          between the ground truth annotation and the trained model on the ‘background’ class.
        </p>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading" id="errors-in-rare-sequences">
          Errors in rare sequences</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">For all sequences of Bengalese
          finch syllables a-b, we examined all possibilities for the following syllable and
          identified the most frequent sequence, a-b-<strong itemscope=""
            itemtype="http://schema.stenci.la/Strong">x</strong>. Then, among all sequences
          a-b-<strong itemscope="" itemtype="http://schema.stenci.la/Strong">y</strong> that are at
          least four times less frequent than a-b-<strong itemscope=""
            itemtype="http://schema.stenci.la/Strong">x</strong>, we measured the frame error during
          the syllable <strong itemscope="" itemtype="http://schema.stenci.la/Strong">y</strong>.
          This detailed analysis showed that there is a very small effect on rare variants. Namely,
          even if the sequence a-b-<strong itemscope=""
            itemtype="http://schema.stenci.la/Strong">y</strong> appears 100–1000 times less
          frequently than a-b-<strong itemscope=""
            itemtype="http://schema.stenci.la/Strong">x</strong> it does not incur high error rates
          in most cases. We use two statistical tests to quantify this claim. First, we measure the
          Pearson correlation between the relative frequency of the rare event (prob(a-b-<strong
            itemscope="" itemtype="http://schema.stenci.la/Strong">y</strong>) divided by
          prob(a-b-<strong itemscope="" itemtype="http://schema.stenci.la/Strong">x</strong>)) and
          the frame error in the syllable ‘y’ (the fraction of spectrogram time bins not labeled ‘y’
          within that segment). Second, we divide the rare events into the more rare and more common
          (relative frequency smaller or larger than 1/8) and measure the fraction of rare events
          exceeding the median error rate. We use the binomial z-test to compare the fraction and
          show that the difference is not significant.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="model-output-as-syllable-likelihoods">Model output as syllable likelihoods</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In <a href="#fig9" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 9</a>, we present model outputs one step
          prior to assigning the most likely label to each spectrogram time bin. At that stage, one
          before the <em itemscope="" itemtype="http://schema.stenci.la/Emphasis">argmax(N)</em>
          step in <a href="#fig2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 2</a>,
          the model output for a given time bin <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="t"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span></span></span></span></span>
          is a real-valued affinity <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="a(t,s)\in ℛ"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">a</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                    class="mjx-mi MJXc-space1"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span><span
                    class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.225em; padding-bottom: 0.372em;">∈</span></span><span
                    class="mjx-texatom MJXc-space3"><span class="mjx-mrow"><span
                        class="mjx-mo"><span class="mjx-char MJXc-TeX-script-R"
                          style="padding-top: 0.519em; padding-bottom: 0.225em;">R</span></span></span></span></span></span></span></span>
          of all predefined syllable classes <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="s"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span></span></span></span></span>.
          In <a href="#fig9" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 9</a> we
          convert these numbers to likelihoods by subtracting the minimum value and normalizing
          separately for each time bin <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math"
                aria-label="L(t,s)=\frac{a(t,s)-{\mathrm{min}}_{{s}^{\prime }}a(t,{s}^{\prime })}{{\displaystyle {\sum }_{\sigma }}[a(t,\sigma )-{\mathrm{min}}_{{s}^{\prime }}a(t,{s}^{\prime })]}"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-mi"><span
                      class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.446em; padding-bottom: 0.298em;">L</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                    class="mjx-mi MJXc-space1"><span class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span><span
                    class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.077em; padding-bottom: 0.298em;">=</span></span><span
                    class="mjx-mfrac MJXc-space3"><span class="mjx-box MJXc-stacked"
                      style="width: 8.324em; padding: 0px 0.12em;"><span class="mjx-numerator"
                        style="font-size: 70.7%; width: 11.772em; top: -1.911em;"><span
                          class="mjx-mrow" style=""><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">a</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.298em; padding-bottom: 0.446em;">−</span></span><span
                            class="mjx-msubsup"><span class="mjx-base"><span
                                class="mjx-texatom"><span class="mjx-mrow"><span
                                    class="mjx-texatom"><span class="mjx-mrow"><span
                                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                          style="padding-top: 0.151em; padding-bottom: 0.372em;">m</span></span><span
                                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                          style="padding-top: 0.372em; padding-bottom: 0.372em;">i</span></span><span
                                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                          style="padding-top: 0.151em; padding-bottom: 0.372em;">n</span></span></span></span></span></span></span><span
                              class="mjx-sub"
                              style="font-size: 83.3%; vertical-align: -0.45em; padding-right: 0.06em;"><span
                                class="mjx-texatom" style=""><span class="mjx-mrow"><span
                                    class="mjx-msubsup"><span class="mjx-base"><span
                                        class="mjx-texatom"><span class="mjx-mrow"><span
                                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                                              style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span></span></span></span><span
                                      class="mjx-sup"
                                      style="vertical-align: 0.289em; padding-left: 0px; padding-right: 0.05em;"><span
                                        class="mjx-texatom"><span class="mjx-mrow"><span
                                            class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                              style="padding-top: 0.298em; padding-bottom: 0.298em;"></span></span></span></span></span></span></span></span></span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">a</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-msubsup"><span class="mjx-base"><span
                                class="mjx-texatom"><span class="mjx-mrow"><span
                                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                                      style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span></span></span></span><span
                              class="mjx-sup"
                              style="font-size: 83.3%; vertical-align: 0.435em; padding-left: 0px; padding-right: 0.06em;"><span
                                class="mjx-texatom" style=""><span class="mjx-mrow"><span
                                    class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                      style="padding-top: 0.298em; padding-bottom: 0.298em;"></span></span></span></span></span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span></span></span><span
                        class="mjx-denominator"
                        style="font-size: 70.7%; width: 11.772em; bottom: -2.086em;"><span
                          class="mjx-mrow" style=""><span class="mjx-texatom"><span
                              class="mjx-mrow"><span class="mjx-mstyle"
                                style="font-size: 141.4%;"><span class="mjx-mrow"><span
                                    class="mjx-msubsup"><span class="mjx-base"><span
                                        class="mjx-texatom"><span class="mjx-mrow"><span
                                            class="mjx-mo"><span class="mjx-char MJXc-TeX-size2-R"
                                              style="padding-top: 0.74em; padding-bottom: 0.74em;"></span></span></span></span></span><span
                                      class="mjx-sub"
                                      style="font-size: 70.7%; vertical-align: -0.722em; padding-right: 0.071em;"><span
                                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                                              style="padding-top: 0.225em; padding-bottom: 0.298em; padding-right: 0.001em;">σ</span></span></span></span></span></span></span></span></span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">[</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">a</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em; padding-right: 0.001em;">σ</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.298em; padding-bottom: 0.446em;"></span></span><span
                            class="mjx-msubsup"><span class="mjx-base"><span
                                class="mjx-texatom"><span class="mjx-mrow"><span
                                    class="mjx-texatom"><span class="mjx-mrow"><span
                                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                          style="padding-top: 0.151em; padding-bottom: 0.372em;">m</span></span><span
                                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                          style="padding-top: 0.372em; padding-bottom: 0.372em;">i</span></span><span
                                        class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                          style="padding-top: 0.151em; padding-bottom: 0.372em;">n</span></span></span></span></span></span></span><span
                              class="mjx-sub"
                              style="font-size: 83.3%; vertical-align: -0.45em; padding-right: 0.06em;"><span
                                class="mjx-texatom" style=""><span class="mjx-mrow"><span
                                    class="mjx-msubsup"><span class="mjx-base"><span
                                        class="mjx-texatom"><span class="mjx-mrow"><span
                                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                                              style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span></span></span></span><span
                                      class="mjx-sup"
                                      style="vertical-align: 0.289em; padding-left: 0px; padding-right: 0.05em;"><span
                                        class="mjx-texatom"><span class="mjx-mrow"><span
                                            class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                              style="padding-top: 0.298em; padding-bottom: 0.298em;"></span></span></span></span></span></span></span></span></span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">a</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.372em; padding-bottom: 0.298em;">t</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="margin-top: -0.144em; padding-bottom: 0.519em;">,</span></span><span
                            class="mjx-msubsup"><span class="mjx-base"><span
                                class="mjx-texatom"><span class="mjx-mrow"><span
                                    class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                                      style="padding-top: 0.225em; padding-bottom: 0.298em;">s</span></span></span></span></span><span
                              class="mjx-sup"
                              style="font-size: 83.3%; vertical-align: 0.347em; padding-left: 0px; padding-right: 0.06em;"><span
                                class="mjx-texatom" style=""><span class="mjx-mrow"><span
                                    class="mjx-mi"><span class="mjx-char MJXc-TeX-main-R"
                                      style="padding-top: 0.298em; padding-bottom: 0.298em;"></span></span></span></span></span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">]</span></span></span></span><span
                        style="border-bottom: 1.3px solid; top: -0.296em; width: 8.324em;"
                        class="mjx-line"></span></span><span
                      style="height: 2.826em; vertical-align: -1.475em;"
                      class="mjx-vsize"></span></span></span></span></span></span>. This
          transformation was done for presentation only. Applying the commonly-used softmax
          transform (<span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math"
                aria-label="x\to \frac{exp(x)}{{\displaystyle {\sum }_{x}}exp(x)}"><span
                  class="mjx-mrow" aria-hidden="true"><span class="mjx-mi"><span
                      class="mjx-char MJXc-TeX-math-I"
                      style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span><span
                    class="mjx-mo MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.225em; padding-bottom: 0.372em;"></span></span><span
                    class="mjx-mfrac MJXc-space3"><span class="mjx-box MJXc-stacked"
                      style="width: 4.069em; padding: 0px 0.12em;"><span class="mjx-numerator"
                        style="font-size: 70.7%; width: 5.755em; top: -1.706em;"><span
                          class="mjx-mrow" style=""><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">e</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.446em;">p</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span></span></span><span
                        class="mjx-denominator"
                        style="font-size: 70.7%; width: 5.755em; bottom: -2.086em;"><span
                          class="mjx-mrow" style=""><span class="mjx-texatom"><span
                              class="mjx-mrow"><span class="mjx-mstyle"
                                style="font-size: 141.4%;"><span class="mjx-mrow"><span
                                    class="mjx-msubsup"><span class="mjx-base"><span
                                        class="mjx-texatom"><span class="mjx-mrow"><span
                                            class="mjx-mo"><span class="mjx-char MJXc-TeX-size2-R"
                                              style="padding-top: 0.74em; padding-bottom: 0.74em;"></span></span></span></span></span><span
                                      class="mjx-sub"
                                      style="font-size: 70.7%; vertical-align: -0.722em; padding-right: 0.071em;"><span
                                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                                              style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span></span></span></span></span></span></span></span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">e</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.446em;">p</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">(</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">x</span></span><span
                            class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                              style="padding-top: 0.446em; padding-bottom: 0.593em;">)</span></span></span></span><span
                        style="border-bottom: 1.3px solid; top: -0.296em; width: 4.069em;"
                        class="mjx-line"></span></span><span
                      style="height: 2.682em; vertical-align: -1.475em;"
                      class="mjx-vsize"></span></span></span></span></span></span>) is equivalent
          since we only keep the maximal value.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="analysis-of-behavioral-data-and-predicted-annotations">Analysis of behavioral data and
          predicted annotations</h3>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="bengalese-finch-branch-points">Bengalese finch branch points</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We analyzed the Bengalese finch
          song in <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib58"><span>58</span><span>Nicholson et al.</span><span>2017</span></a></cite>
          to determine whether we could replicate key findings about the stability of branch points
          from <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib96"><span>96</span><span>Warren et al.</span><span>2012</span></a></cite> as
          described in the main text, and, if so, whether we could recover those results from
          annotations predicted by TweetyNet.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">To analyze statistics of branch
          points in the Bengalese finch song from <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib58"><span>58</span><span>Nicholson
                et al.</span><span>2017</span></a></cite>, we first identified candidate branch
          points by visual inspection of each bird’s annotated song. Then, for each day of a bird’s
          song, we counted all occurrences of transitions from one syllable class to another, that
          is bigrams. We placed these counts in a matrix where rows were the first syllable of the
          bigram (‘from’) and the columns were the second syllable (‘to’), and then performed a
          row-wise normalization to produce a first-order Markov transition matrix, where the
          elements are transition probabilities from one syllable class to another. We also applied
          a thresholding so that any elements in the matrix less than 0.002 were set to 0.</p>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading" id="statistical-test">
          Statistical test</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">To test whether transition
          probabilities were stable across days, we used a permutation test, replicating the
          analysis of <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib96"><span>96</span><span>Warren et al.</span><span>2012</span></a></cite>.
          We took all occurrences of a transition point across two days, and then for each
          permutation, swapped the label for which day it belonged to, and then computed the
          transition probabilities for the permuted days. Using 1000 permutations, we generated a
          distribution of differences and then asked whether the observed difference was larger than
          this bootstrapped distribution.</p>
        <h5 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="analysis-of-predicted-annotations">Analysis of predicted annotations</h5>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Before testing whether we could
          recover the result that branch points were stable from annotations predicted by TweetyNet,
          we measured the syllable error rate of the trained models that we would use to predict
          annotations. For 1 day of song from one bird (or60yw70), we realized that the
          manually-annotated set of songs was even smaller (200 s) than the test sets we used in the
          benchmarking section. We removed this day because it is not really an ‘entire day’ of
          song.</p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="canary-syntax-model">Canary
          syntax model</h3>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="shared-template-dependence-on-number-of-syllables-in-song">Shared template dependence
          on number of syllables in song</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">In each bird, we define an
          upper bound for repeating parts of songs using pairwise comparisons. For each song we
          examined all other songs with equal or larger number of syllables and found the largest
          shared string of consecutive syllables. The fraction of shared syllables is the ratio
          between the length of the shared sequence and the number of syllables in the first, shorter,
          song. Then, we bin songs by syllable counts (bin size is 10 syllables) and calculate the
          mean and standard deviation across all pairwise comparisons. Results are shown in <a
            href="#fig1s2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 1—figure
            supplement 2</a>.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="probabilistic-suffix-trees">
          Probabilistic suffix trees</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">For each canary phrase type, we
          describe the dependency of the following transition on previous phrases with a
          probabilistic suffix tree. This method was described in a previous publication from our
          lab <span itemscope="" itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib50"><span>50</span><span>Markowitz et
                  al.</span><span>2013</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib51"><span>51</span><span>Markowitz</span><span>2022</span></a></cite></span>.
          Briefly, the tree is a directed graph in which each phrase type is a root node
          representing the first order (Markov) transition probabilities to downstream phrases,
          including the end of song. The pie charts in <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7</a>, <a href="#fig7s1" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7—figure supplement 1</a>, and <a
            href="#fig7s2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 7—figure
            supplement 2</a> show such probabilities. Upstream nodes represent higher order Markov
          chains that are added sequentially if they significantly add information about the
          transition.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="model-cross-validation-to-determine-minimal-node-frequency">Model cross validation to
          determine minimal node frequency</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">To prevent overfitting, nodes
          in the probabilistic suffix trees are added only if they appear more often than a
          threshold frequency, <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="{P}_{min}"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-msubsup"><span class="mjx-base"
                      style="margin-right: -0.109em;"><span class="mjx-texatom"><span
                          class="mjx-mrow"><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em; padding-right: 0.109em;">P</span></span></span></span></span><span
                      class="mjx-sub"
                      style="font-size: 70.7%; vertical-align: -0.212em; padding-right: 0.071em;"><span
                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">m</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em;">i</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">n</span></span></span></span></span></span></span></span></span></span>.
          To determine <span itemscope="" itemtype="http://schema.stenci.la/MathFragment"><span
              class="mjx-chtml"><span class="mjx-math" aria-label="{P}_{min}"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-msubsup"><span class="mjx-base"
                      style="margin-right: -0.109em;"><span class="mjx-texatom"><span
                          class="mjx-mrow"><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em; padding-right: 0.109em;">P</span></span></span></span></span><span
                      class="mjx-sub"
                      style="font-size: 70.7%; vertical-align: -0.212em; padding-right: 0.071em;"><span
                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">m</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em;">i</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">n</span></span></span></span></span></span></span></span></span></span>
          we replicate the procedure in <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib50"><span>50</span><span>Markowitz
                et al.</span><span>2013</span></a></cite> and carry out a 10-fold model cross-validation
          procedure. In this procedure the dataset is randomly divided into a training set,
          containing 90 percent of songs, and a test set, containing 10 percent of songs. A PST is
          created using the training set and used to calculate the negative log likelihood of the
          test set. This procedure is repeated 10 times for each value of <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="{P}_{min}"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-msubsup"><span class="mjx-base"
                      style="margin-right: -0.109em;"><span class="mjx-texatom"><span
                          class="mjx-mrow"><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em; padding-right: 0.109em;">P</span></span></span></span></span><span
                      class="mjx-sub"
                      style="font-size: 70.7%; vertical-align: -0.212em; padding-right: 0.071em;"><span
                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">m</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em;">i</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">n</span></span></span></span></span></span></span></span></span></span>,
          the x-axis in <a href="#fig8" itemscope="" itemtype="http://schema.stenci.la/Link">Figure
            8a</a>. For data sets of different sizes (curves in <a href="#fig8" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 8a</a> x-axis in <a href="#fig8"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 8b</a>) the mean negative
          log-likelihood across the 10 cross validation subsets and across 10 data sets, y-axis in
          <a href="#fig8" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 8a</a>, is
          then used to find the optimal value of <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="{P}_{min}"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-msubsup"><span class="mjx-base"
                      style="margin-right: -0.109em;"><span class="mjx-texatom"><span
                          class="mjx-mrow"><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em; padding-right: 0.109em;">P</span></span></span></span></span><span
                      class="mjx-sub"
                      style="font-size: 70.7%; vertical-align: -0.212em; padding-right: 0.071em;"><span
                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">m</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em;">i</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">n</span></span></span></span></span></span></span></span></span></span>
          - the minimum negative log-likelihood that corresponds to the highest precision without
          over-fitting the training set. All PSTs in <a href="#fig7" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7</a>, <a href="#fig7s1" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figure 7—figure supplement 1</a>, and <a
            href="#fig7s2" itemscope="" itemtype="http://schema.stenci.la/Link">Figure 7—figure
            supplement 2</a> are created using the cross-validated <span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="{P}_{min}"><span class="mjx-mrow"
                  aria-hidden="true"><span class="mjx-msubsup"><span class="mjx-base"
                      style="margin-right: -0.109em;"><span class="mjx-texatom"><span
                          class="mjx-mrow"><span class="mjx-mi"><span
                              class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em; padding-right: 0.109em;">P</span></span></span></span></span><span
                      class="mjx-sub"
                      style="font-size: 70.7%; vertical-align: -0.212em; padding-right: 0.071em;"><span
                        class="mjx-texatom" style=""><span class="mjx-mrow"><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">m</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.446em; padding-bottom: 0.298em;">i</span></span><span
                            class="mjx-mi"><span class="mjx-char MJXc-TeX-math-I"
                              style="padding-top: 0.225em; padding-bottom: 0.298em;">n</span></span></span></span></span></span></span></span></span></span>.
        </p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="code-availability">Code
          Availability</h3>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">The code implementing the
          TweetyNet architecture, and code to reproduce experiments and figures in this paper, are
          available <a href="https://github.com/yardencsGitHub/tweetynet" itemscope=""
            itemtype="http://schema.stenci.la/Link">here</a> (version 0.7.1,
          10.5281/zenodo.5823556).</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">To aid with reproducibility of
          our experiments, and to make TweetyNet more accessible to researchers studying birdsong
          and other animal vocalizations, we developed a software library, vak <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib59"><span>59</span><span>Nicholson
                and Cohen</span><span>2021</span></a></cite>, available <a
            href="https://github.com/NickleDave/vak" itemscope=""
            itemtype="http://schema.stenci.la/Link">on GitHub</a>.</p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">We also developed Python tools
          to work with the datasets and their annotation: <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib60"><span>60</span><span>Nicholson</span><span>2021</span></a></cite><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib62"><span>62</span><span>Nicholson</span><span>2021</span></a></cite><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib63"><span>63</span><span>Nicholson</span><span>2021</span></a></cite></span>.
        </p>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">All software was implemented in
          Python, leveraging the following open-source scientific libraries, frameworks, and tools:
          attrs <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib79"><span>79</span><span>Schlawack</span><span>2020</span></a></cite>, dask
          <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib21"><span>21</span><span>Dask Development
                Team</span><span>2016</span></a></cite>, jupyter <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib42"><span>42</span><span>Kluyver et
                al.</span><span>2016</span></a></cite>, matplotlib <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib36"><span>36</span><span>Hunter</span><span>2007</span></a></cite><cite
              itemscope="" itemtype="http://schema.stenci.la/Cite"><a
                href="#bib14"><span>14</span><span>Caswell et
                  al.</span><span>2020</span></a></cite></span>, numpy <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib93"><span>93</span><span>van der
                  Walt et al.</span><span>2011</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib33"><span>33</span><span>Harris
                  et al.</span><span>2020</span></a></cite></span>, pandas <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib67"><span>67</span><span>pandas
                development team</span><span>2020</span></a></cite>, scikit-learn <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib70"><span>70</span><span>Pedregosa et
                  al.</span><span>2011</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib32"><span>32</span><span>Grisel
                  et al.</span><span>2020</span></a></cite></span>, scipy <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib95"><span>95</span><span>Virtanen
                et al.</span><span>2020</span></a></cite>, torch <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib69"><span>69</span><span>Paszke et
                al.</span><span>2017</span></a></cite>, torchvision <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib49"><span>49</span><span>Marcel and
                Rodriguez</span><span>2010</span></a></cite>, seaborn <span itemscope=""
            itemtype="http://schema.stenci.la/CiteGroup"><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a href="#bib97"><span>97</span><span>Waskom
                  et al.</span><span>2020</span></a></cite><cite itemscope=""
              itemtype="http://schema.stenci.la/Cite"><a
                href="#bib98"><span>98</span><span>Waskom</span><span>2021</span></a></cite></span>,
          and tqdm <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib19"><span>19</span><span>da Costa-Luis</span><span>2019</span></a></cite>.
        </p>
        <h3 itemscope="" itemtype="http://schema.stenci.la/Heading" id="data-collection">Data
          Collection</h3>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading" id="use-of-existing-datasets">
          Use of existing datasets</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Bengalese finch song is from
          two publicly-available repositories. Results in <a href="#fig3" itemscope=""
            itemtype="http://schema.stenci.la/Link">Figures 3</a>, <a href="#fig4" itemscope=""
            itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">4</span></a> and <a href="#fig6" itemscope=""
            itemtype="http://schema.stenci.la/Link"><span
              data-itemtype="http://schema.org/Number">6</span></a> all make use of &quot;<a
            href="https://figshare.com/articles/Bengalese_Finch_song_repository/4805749"
            itemscope="" itemtype="http://schema.stenci.la/Link">Bengalese finch Song
            Repository</a>&quot; <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib58"><span>58</span><span>Nicholson et
                al.</span><span>2017</span></a></cite>. For experiments in <a href="#fig4"
            itemscope="" itemtype="http://schema.stenci.la/Link">Figure 4</a> we added song from
          four Bengalese finches in an additional <a
            href="https://figshare.com/articles/BirdsongRecognition/3470165" itemscope=""
            itemtype="http://schema.stenci.la/Link">dataset</a>, <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a
              href="#bib44"><span>44</span><span>Koumura</span><span>2016</span></a></cite>, which
          accompanied the paper <cite itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib45"><span>45</span><span>Koumura and
                Okanoya</span><span>2016</span></a></cite>. Please see ‘Annotation of Bengalese
          finch song’ for a description of how we annotated that data. Supplementary figures with
          descriptive statistics of song also use datasets of Waterslager canary songs <cite
            itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib50"><span>50</span><span>Markowitz et
                al.</span><span>2013</span></a></cite>, Bengalese finch songs <cite itemscope=""
            itemtype="http://schema.stenci.la/Cite"><a href="#bib45"><span>45</span><span>Koumura
                and Okanoya</span><span>2016</span></a></cite> and Zebra finch songs <cite
            itemscope="" itemtype="http://schema.stenci.la/Cite"><a
              href="#bib66"><span>66</span><span>Otchy et al.</span><span>2015</span></a></cite>
          generously shared by those authors.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="domestic-canary-song-screening">Domestic canary song screening</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">Birds were individually housed
          in soundproof boxes and recorded for 3–5 days (Audio-Technica AT831B Lavalier Condenser
          Microphone, M-Audio Octane amplifiers, HDSPe RayDAT sound card and VOS games&#39; Boom
          Recorder software on a Mac Pro desktop computer). In-house software was used to detect and
          save only sound segments that contained vocalizations. These recordings were used to
          select subjects that are copious singers (<span itemscope=""
            itemtype="http://schema.stenci.la/MathFragment"><span class="mjx-chtml"><span
                class="mjx-math" aria-label="\ge 50"><span class="mjx-mrow" aria-hidden="true"><span
                    class="mjx-mo"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.372em; padding-bottom: 0.446em;">≥</span></span><span
                    class="mjx-mn MJXc-space3"><span class="mjx-char MJXc-TeX-main-R"
                      style="padding-top: 0.372em; padding-bottom: 0.372em;">50</span></span></span></span></span></span>
          songs per day) and produce at least 10 different syllable types.</p>
        <h4 itemscope="" itemtype="http://schema.stenci.la/Heading"
          id="domestic-canary-audio-recording">Domestic canary audio recording</h4>
        <p itemscope="" itemtype="http://schema.stenci.la/Paragraph">All data used in this
          manuscript was acquired between late April and early May 2018—a period during which
          canaries perform their mating season songs. Birds were individually housed in soundproof
          boxes and recorded for 7–10 days (Audio-Technica AT831B Lavalier Condenser Microphone,
          M-Audio M-track amplifiers, and VOS games&#39; Boom Recorder software on a Mac Pro desktop
          computer). In-house software was used to detect and save only sound segments that
          contained vocalizations. Separate songs were defined by silence gaps exceeding 1 second.
        </p>
        <section data-itemprop="references">
          <h2 data-itemtype="http://schema.stenci.la/Heading">References</h2>
          <ol>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib1">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="O Abdel-Hamid"><span
                    data-itemprop="givenNames"><span itemprop="givenName">O</span></span><span
                    data-itemprop="familyNames"><span
                      itemprop="familyName">Abdel-Hamid</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="A Mohamed"><span data-itemprop="givenNames"><span
                      itemprop="givenName">A</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Mohamed</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="H Jiang"><span data-itemprop="givenNames"><span
                      itemprop="givenName">H</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Jiang</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="L Deng"><span data-itemprop="givenNames"><span
                      itemprop="givenName">L</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Deng</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="G Penn"><span data-itemprop="givenNames"><span
                      itemprop="givenName">G</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Penn</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="D Yu"><span data-itemprop="givenNames"><span
                      itemprop="givenName">D</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Yu</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2014">2014</time><span
                itemprop="headline">Convolutional Neural Networks for Speech Recognition</span><span
                itemscope="" itemtype="http://schema.org/PublicationVolume"
                itemprop="isPartOf"><span itemprop="volumeNumber"
                  data-itemtype="http://schema.org/Number">22</span><span itemscope=""
                  itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">IEEE/ACM Transactions on Audio, Speech, and Language
                    Processing</span></span></span><span itemprop="pageStart"
                data-itemtype="http://schema.org/Number">1533</span><span itemprop="pageEnd"
                data-itemtype="http://schema.org/Number">1545</span><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=Convolutional%20Neural%20Networks%20for%20Speech%20Recognition">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span
                    itemprop="value">10.1109/TASLP.2014.2339736</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib2">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="J Alliende"><span data-itemprop="givenNames"><span
                      itemprop="givenName">J</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Alliende</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="K Lehongre"><span data-itemprop="givenNames"><span
                      itemprop="givenName">K</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Lehongre</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="C Del Negro"><span data-itemprop="givenNames"><span
                      itemprop="givenName">C</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Del</span><span
                      itemprop="familyName">Negro</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2013">2013</time><span
                itemprop="headline">A species-specific view of song representation in a sensorimotor
                nucleus</span><span itemscope="" itemtype="http://schema.org/PublicationVolume"
                itemprop="isPartOf"><span itemprop="volumeNumber"
                  data-itemtype="http://schema.org/Number">107</span><span itemscope=""
                  itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">Journal of Physiology, Paris</span></span></span><span
                itemprop="pageStart" data-itemtype="http://schema.org/Number">193</span><span
                itemprop="pageEnd" data-itemtype="http://schema.org/Number">202</span><span
                itemscope="" itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=A%20species-specific%20view%20of%20song%20representation%20in%20a%20sensorimotor%20nucleus">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span
                    itemprop="value">10.1016/j.jphysparis.2012.08.004</span>
                </li>
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/pmid"><span
                    itemprop="name">pmid</span><span itemprop="value"
                    data-itemtype="http://schema.org/Number">22960663</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib3">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="LM Alonso"><span data-itemprop="givenNames"><span
                      itemprop="givenName">LM</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Alonso</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="JA Alliende"><span data-itemprop="givenNames"><span
                      itemprop="givenName">JA</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Alliende</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="F Goller"><span data-itemprop="givenNames"><span
                      itemprop="givenName">F</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Goller</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="GB Mindlin"><span data-itemprop="givenNames"><span
                      itemprop="givenName">GB</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Mindlin</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2009">2009</time><span
                itemprop="headline">Low-dimensional dynamical model for the diversity of pressure
                patterns used in canary song</span><span itemscope=""
                itemtype="http://schema.org/PublicationVolume" itemprop="isPartOf"><span
                  itemprop="volumeNumber" data-itemtype="http://schema.org/Number">79</span><span
                  itemscope="" itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">Physical Review. E, Statistical, Nonlinear, and Soft Matter
                    Physics</span></span></span><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=Low-dimensional%20dynamical%20model%20for%20the%20diversity%20of%20pressure%20patterns%20used%20in%20canary%20song">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span
                    itemprop="value">10.1103/PhysRevE.79.041929</span>
                </li>
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/pmid"><span
                    itemprop="name">pmid</span><span itemprop="value"
                    data-itemtype="http://schema.org/Number">19518278</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib4">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="A Alvarez-Buylla"><span
                    data-itemprop="givenNames"><span itemprop="givenName">A</span></span><span
                    data-itemprop="familyNames"><span
                      itemprop="familyName">Alvarez-Buylla</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="JR Kirn"><span data-itemprop="givenNames"><span
                      itemprop="givenName">JR</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Kirn</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="F Nottebohm"><span data-itemprop="givenNames"><span
                      itemprop="givenName">F</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Nottebohm</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="1990">1990</time><span
                itemprop="headline">Birth of projection neurons in adult avian brain may be related
                to perceptual or motor learning</span><span itemscope=""
                itemtype="http://schema.org/PublicationVolume" itemprop="isPartOf"><span
                  itemprop="volumeNumber" data-itemtype="http://schema.org/Number">249</span><span
                  itemscope="" itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">Science</span></span></span><span itemprop="pageStart"
                data-itemtype="http://schema.org/Number">1444</span><span itemprop="pageEnd"
                data-itemtype="http://schema.org/Number">1446</span><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=Birth%20of%20projection%20neurons%20in%20adult%20avian%20brain%20may%20be%20related%20to%20perceptual%20or%20motor%20learning">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span itemprop="value">10.1126/science.1698312</span>
                </li>
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/pmid"><span
                    itemprop="name">pmid</span><span itemprop="value"
                    data-itemtype="http://schema.org/Number">1698312</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib5">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="D Amodei"><span data-itemprop="givenNames"><span
                      itemprop="givenName">D</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Amodei</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="S Ananthanarayanan"><span
                    data-itemprop="givenNames"><span itemprop="givenName">S</span></span><span
                    data-itemprop="familyNames"><span
                      itemprop="familyName">Ananthanarayanan</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="R Anubhai"><span data-itemprop="givenNames"><span
                      itemprop="givenName">R</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Anubhai</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="J Bai"><span data-itemprop="givenNames"><span
                      itemprop="givenName">J</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Bai</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="E Battenberg"><span
                    data-itemprop="givenNames"><span itemprop="givenName">E</span></span><span
                    data-itemprop="familyNames"><span itemprop="familyName">Battenberg</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="C Case"><span data-itemprop="givenNames"><span
                      itemprop="givenName">C</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Case</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="J Casper"><span data-itemprop="givenNames"><span
                      itemprop="givenName">J</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Casper</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="B Catanzaro"><span data-itemprop="givenNames"><span
                      itemprop="givenName">B</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Catanzaro</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="Q Cheng"><span data-itemprop="givenNames"><span
                      itemprop="givenName">Q</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Cheng</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="G Chen"><span data-itemprop="givenNames"><span
                      itemprop="givenName">G</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Chen</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2016">2016</time><span
                itemprop="headline">Deep Speech 2: End-to-End Speech Recognition in English and
                Mandarin. In International Conference on Machine Learning, PMLR</span><span
                itemprop="pageStart" data-itemtype="http://schema.org/Number">173</span><span
                itemprop="pageEnd" data-itemtype="http://schema.org/Number">182</span><span
                itemscope="" itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=In%20International%20Conference%20on%20Machine%20Learning%20PMLR">
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib6">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="SE Anderson"><span data-itemprop="givenNames"><span
                      itemprop="givenName">SE</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Anderson</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="AS Dave"><span data-itemprop="givenNames"><span
                      itemprop="givenName">AS</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Dave</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="D Margoliash"><span
                    data-itemprop="givenNames"><span itemprop="givenName">D</span></span><span
                    data-itemprop="familyNames"><span itemprop="familyName">Margoliash</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="1996">1996</time><span
                itemprop="headline">Template-based automatic recognition of birdsong syllables from
                continuous recordings</span><span itemscope=""
                itemtype="http://schema.org/PublicationVolume" itemprop="isPartOf"><span
                  itemprop="volumeNumber" data-itemtype="http://schema.org/Number">100</span><span
                  itemscope="" itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">The Journal of the Acoustical Society of
                    America</span></span></span><span itemprop="pageStart"
                data-itemtype="http://schema.org/Number">1209</span><span itemprop="pageEnd"
                data-itemtype="http://schema.org/Number">1219</span><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=Template-based%20automatic%20recognition%20of%20birdsong%20syllables%20from%20continuous%20recordings">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span itemprop="value">10.1121/1.415968</span>
                </li>
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/pmid"><span
                    itemprop="name">pmid</span><span itemprop="value"
                    data-itemtype="http://schema.org/Number">8759970</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib7">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="D Appeltants"><span
                    data-itemprop="givenNames"><span itemprop="givenName">D</span></span><span
                    data-itemprop="familyNames"><span itemprop="familyName">Appeltants</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="TQ Gentner"><span data-itemprop="givenNames"><span
                      itemprop="givenName">TQ</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Gentner</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="SH Hulse"><span data-itemprop="givenNames"><span
                      itemprop="givenName">SH</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Hulse</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="J Balthazart"><span
                    data-itemprop="givenNames"><span itemprop="givenName">J</span></span><span
                    data-itemprop="familyNames"><span itemprop="familyName">Balthazart</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="GF Ball"><span data-itemprop="givenNames"><span
                      itemprop="givenName">GF</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Ball</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2005">2005</time><span
                itemprop="headline">The effect of auditory distractors on song discrimination in
                male canaries (Serinus canaria)</span><span itemscope=""
                itemtype="http://schema.org/PublicationVolume" itemprop="isPartOf"><span
                  itemprop="volumeNumber" data-itemtype="http://schema.org/Number">69</span><span
                  itemscope="" itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">Behavioural Processes</span></span></span><span
                itemprop="pageStart" data-itemtype="http://schema.org/Number">331</span><span
                itemprop="pageEnd" data-itemtype="http://schema.org/Number">341</span><span
                itemscope="" itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=The%20effect%20of%20auditory%20distractors%20on%20song%20discrimination%20in%20male%20canaries%20(Serinus%20canaria)">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span
                    itemprop="value">10.1016/j.beproc.2005.01.010</span>
                </li>
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/pmid"><span
                    itemprop="name">pmid</span><span itemprop="value"
                    data-itemtype="http://schema.org/Number">15896531</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib8">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Organization" itemprop="author"><span
                    itemprop="name">Audacity Team</span></li>
              </ol><time itemprop="datePublished" datetime="2019">2019</time><a itemprop="url"
                href="https://www.audacityteam.org/2019/?p=t"><span
                  itemprop="headline">Audacity</span></a><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher"><span
                  itemprop="name">Audacity</span><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Audacity">
                </span></span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=Audacity">
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib9">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="RC Berwick"><span data-itemprop="givenNames"><span
                      itemprop="givenName">RC</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Berwick</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="K Okanoya"><span data-itemprop="givenNames"><span
                      itemprop="givenName">K</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Okanoya</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="GJL Beckers"><span data-itemprop="givenNames"><span
                      itemprop="givenName">GJL</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Beckers</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="JJ Bolhuis"><span data-itemprop="givenNames"><span
                      itemprop="givenName">JJ</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Bolhuis</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2011">2011</time><span
                itemprop="headline">Songs to syntax: the linguistics of birdsong</span><span
                itemscope="" itemtype="http://schema.org/PublicationVolume"
                itemprop="isPartOf"><span itemprop="volumeNumber"
                  data-itemtype="http://schema.org/Number">15</span><span itemscope=""
                  itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">Trends in Cognitive Sciences</span></span></span><span
                itemprop="pageStart" data-itemtype="http://schema.org/Number">113</span><span
                itemprop="pageEnd" data-itemtype="http://schema.org/Number">121</span><span
                itemscope="" itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=Songs%20to%20syntax:%20the%20linguistics%20of%20birdsong">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span
                    itemprop="value">10.1016/j.tics.2011.01.002</span>
                </li>
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/pmid"><span
                    itemprop="name">pmid</span><span itemprop="value"
                    data-itemtype="http://schema.org/Number">21296608</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib10">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="S Böck"><span data-itemprop="givenNames"><span
                      itemprop="givenName">S</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Böck</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="M Schedl"><span data-itemprop="givenNames"><span
                      itemprop="givenName">M</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Schedl</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2012">2012</time><span
                itemprop="headline">IEEE International Conference on Acoustics, Speech and Signal
                Processing (ICASSP)</span><span itemprop="pageStart"
                data-itemtype="http://schema.org/Number">121</span><span itemprop="pageEnd"
                data-itemtype="http://schema.org/Number">124</span><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=IEEE%20International%20Conference%20on%20Acoustics,%20Speech%20and%20Signal%20Processing%20(ICASSP)">
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib11">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="P Boersma"><span data-itemprop="givenNames"><span
                      itemprop="givenName">P</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Boersma</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="D Weenink"><span data-itemprop="givenNames"><span
                      itemprop="givenName">D</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Weenink</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2021">2021</time><a itemprop="url"
                href="https://www.fon.hum.uva.nl/praat/"><span itemprop="headline">Doing Phonetics
                  by Computer</span></a><span itemscope="" itemtype="http://schema.org/Organization"
                itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=Doing%20Phonetics%20by%20Computer">
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib12">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="MS Brainard"><span data-itemprop="givenNames"><span
                      itemprop="givenName">MS</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Brainard</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="AJ Doupe"><span data-itemprop="givenNames"><span
                      itemprop="givenName">AJ</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Doupe</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2002">2002</time><span
                itemprop="headline">What songbirds teach us about learning</span><span itemscope=""
                itemtype="http://schema.org/PublicationVolume" itemprop="isPartOf"><span
                  itemprop="volumeNumber" data-itemtype="http://schema.org/Number">417</span><span
                  itemscope="" itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">Nature</span></span></span><span itemprop="pageStart"
                data-itemtype="http://schema.org/Number">351</span><span itemprop="pageEnd"
                data-itemtype="http://schema.org/Number">358</span><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=What%20songbirds%20teach%20us%20about%20learning">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span itemprop="value">10.1038/417351a</span>
                </li>
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/pmid"><span
                    itemprop="name">pmid</span><span itemprop="value"
                    data-itemtype="http://schema.org/Number">12015616</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib13">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="ZD Burkett"><span data-itemprop="givenNames"><span
                      itemprop="givenName">ZD</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Burkett</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="NF Day"><span data-itemprop="givenNames"><span
                      itemprop="givenName">NF</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Day</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="O Peñagarikano"><span
                    data-itemprop="givenNames"><span itemprop="givenName">O</span></span><span
                    data-itemprop="familyNames"><span
                      itemprop="familyName">Peñagarikano</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="DH Geschwind"><span
                    data-itemprop="givenNames"><span itemprop="givenName">DH</span></span><span
                    data-itemprop="familyNames"><span itemprop="familyName">Geschwind</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="SA White"><span data-itemprop="givenNames"><span
                      itemprop="givenName">SA</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">White</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2015">2015</time><span
                itemprop="headline">VoICE: A semi-automated pipeline for standardizing vocal
                analysis across models</span><span itemscope=""
                itemtype="http://schema.org/PublicationVolume" itemprop="isPartOf"><span
                  itemprop="volumeNumber" data-itemtype="http://schema.org/Number">5</span><span
                  itemscope="" itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">Scientific Reports</span></span></span><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=VoICE:%20A%20semi-automated%20pipeline%20for%20standardizing%20vocal%20analysis%20across%20models">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span itemprop="value">10.1038/srep10237</span>
                </li>
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/pmid"><span
                    itemprop="name">pmid</span><span itemprop="value"
                    data-itemtype="http://schema.org/Number">26018425</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib14">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="TA Caswell"><span data-itemprop="givenNames"><span
                      itemprop="givenName">TA</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Caswell</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="M Droettboom"><span
                    data-itemprop="givenNames"><span itemprop="givenName">M</span></span><span
                    data-itemprop="familyNames"><span itemprop="familyName">Droettboom</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="A Lee"><span data-itemprop="givenNames"><span
                      itemprop="givenName">A</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Lee</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="ES de Andrade"><span
                    data-itemprop="givenNames"><span itemprop="givenName">ES</span></span><span
                    data-itemprop="familyNames"><span itemprop="familyName">de</span><span
                      itemprop="familyName">Andrade</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2020">2020</time><a itemprop="url"
                href="https://doi.org/10.5281/zenodo.592536"><span
                  itemprop="headline">Matplotlib/Matplotlib: REL</span></a><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=Matplotlib/Matplotlib:%20REL">
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib15">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="T Chen"><span data-itemprop="givenNames"><span
                      itemprop="givenName">T</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Chen</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="C Guestrin"><span data-itemprop="givenNames"><span
                      itemprop="givenName">C</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Guestrin</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2016">2016</time><span
                itemprop="headline">In Proceedings of the 22nd ACM SIGKDD International Conference
                on Knowledge Discovery and Data Mining, KDD ’16</span><span itemprop="pageStart"
                data-itemtype="http://schema.org/Number">785</span><span itemprop="pageEnd"
                data-itemtype="http://schema.org/Number">794</span><span itemscope=""
                itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=In%20Proceedings%20of%20the%2022nd%20ACM%20SIGKDD%20International%20Conference%20on%20Knowledge%20Discovery%20and%20Data%20Mining,%20KDD%20%E2%80%9916">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span itemprop="value">10.1145/2939672.2939785</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib16">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="KR Coffey"><span data-itemprop="givenNames"><span
                      itemprop="givenName">KR</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Coffey</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="RG Marx"><span data-itemprop="givenNames"><span
                      itemprop="givenName">RG</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Marx</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="JF Neumaier"><span data-itemprop="givenNames"><span
                      itemprop="givenName">JF</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Neumaier</span></span>
                </li>
              </ol><time itemprop="datePublished" datetime="2019">2019</time><span
                itemprop="headline">DeepSqueak: a deep learning-based system for detection and
                analysis of ultrasonic vocalizations</span><span itemscope=""
                itemtype="http://schema.org/PublicationVolume" itemprop="isPartOf"><span
                  itemprop="volumeNumber" data-itemtype="http://schema.org/Number">44</span><span
                  itemscope="" itemtype="http://schema.org/Periodical" itemprop="isPartOf"><span
                    itemprop="name">Neuropsychopharmacology: Official Publication of the American
                    College of Neuropsychopharmacology</span></span></span><span
                itemprop="pageStart" data-itemtype="http://schema.org/Number">859</span><span
                itemprop="pageEnd" data-itemtype="http://schema.org/Number">868</span><span
                itemscope="" itemtype="http://schema.org/Organization" itemprop="publisher">
                <meta itemprop="name" content="Unknown"><span itemscope=""
                  itemtype="http://schema.org/ImageObject" itemprop="logo">
                  <meta itemprop="url"
                    content="https://via.placeholder.com/600x60/dbdbdb/4a4a4a.png?text=Unknown">
                </span>
              </span>
              <meta itemprop="image"
                content="https://via.placeholder.com/1200x714/dbdbdb/4a4a4a.png?text=DeepSqueak:%20a%20deep%20learning-based%20system%20for%20detection%20and%20analysis%20of%20ultrasonic%20vocalizations">
              <ul data-itemprop="identifiers">
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/doi"><span
                    itemprop="name">doi</span><span
                    itemprop="value">10.1038/s41386-018-0303-6</span>
                </li>
                <li itemscope="" itemtype="http://schema.org/PropertyValue" itemprop="identifier">
                  <meta itemprop="propertyID"
                    content="https://registry.identifiers.org/registry/pmid"><span
                    itemprop="name">pmid</span><span itemprop="value"
                    data-itemtype="http://schema.org/Number">30610191</span>
                </li>
              </ul>
            </li>
            <li itemscope="" itemtype="http://schema.org/Article" itemprop="citation" id="bib17">
              <ol data-itemprop="authors">
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="Y Cohen"><span data-itemprop="givenNames"><span
                      itemprop="givenName">Y</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Cohen</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="J Shen"><span data-itemprop="givenNames"><span
                      itemprop="givenName">J</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Shen</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="D Semu"><span data-itemprop="givenNames"><span
                      itemprop="givenName">D</span></span><span data-itemprop="familyNames"><span
                      itemprop="familyName">Semu</span></span>
                </li>
                <li itemscope="" itemtype="http://schema.org/Person" itemprop="author">
                  <meta itemprop="name" content="DP Leman"><span data-itemprop="givenNames"><span
                      itemprop="givenName">DP</span></span><span data-itemprop="familyNames"><