import React from 'react'
import ReactMarkdown from 'react-markdown'
import { HashLink } from 'react-router-hash-link';
import Site from '../site'

import { Audio, Download, Image, Table } from '../sitecomponents'
import WavTable from '../wavtable'

import '../css/site.css'


/*******************************************************************************
Constants
*******************************************************************************/


// Site identifier passed as the `name` prop to <Site>, <Image>, and <WavTable>.
const NAME = 'promonet';


/*******************************************************************************
ProMoNet site
*******************************************************************************/


export default function ProMoNet() {
  return (
    <Site name={NAME}>
      <Overview/>
      <TableOfContents/>
      <Reconstruction/>
      <PitchShifting/>
      <TimeStretching/>
      <VolumeTimbre/>
      <SpectralBalance/>
      <SpeakerAdaptation/>
      <VoiceConversion/>
      <AccentConversion />
      <SingingTransfer />
      <Onomatopoeia />
      <DummyCitation />
    </Site>);
}


/*******************************************************************************
Site structure
*******************************************************************************/


function DummyCitation() {
  return (<section id='citation'></section>);
}


function TableOfContents() {
  let location = '/sites/promonet';
  return (
    <div className='section'>
      <ReactMarkdown>{`## Table of Contents`}</ReactMarkdown>
      <ul style={{ marginBottom: 0 }}>
        <li>
          <HashLink to={`${location}/#reconstruction`}>
            Speech reconstruction (Section 5.3)
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#pitch-shifting`}>
            Pitch-shifting (Section 5.4)
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#time-stretching`}>
            Time-stretching (Section 5.4)
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#volume-timbre`}>
            Timbral correlates of volume (Sections 3; 5.5)
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#spectral-balance`}>
            Spectral balance (Sections 3; 5.5)
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#adaptation`}>
            Speaker adaptation
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#voice-conversion`}>
            Voice conversion
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#accent-conversion`}>
            Accent conversion
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#singing-transfer`}>
            Singing voice transfer
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#automatic-onomatopoeia`}>
            Automatic onomatopoeia
          </HashLink>
        </li>
        <li>
          <HashLink to={`${location}/#citation`}>
            How to cite this work
          </HashLink>
        </li>
      </ul>
    </div>
  );
}


function AccentConversion() {
  let markdown = `## Accent conversion`;
  return (
    <div className='section'>
      <section id='accent-conversion'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        <Image
          name={NAME}
          file={`accent.png`}
          alt={'Accent conversion'}
          caption={'We demonstrate regex-based accent conversion by encode an example speech utterance in my proposed representation, performing edits to the PPG, and synthesize speech with modified PPGs. We specify edits to the PPG that convert from the original, South African accent to an American Midwestern accent. My sequential rule set for this example is as follows: (1) ``reallocate``(["dh", "ah"], ["th", "ah"]), (2) ``reallocate``(["n", "aa", "t"], ["n", "ah", "t"]), (3) ``reallocate``("er", "r"), (4) ``reallocate``("ae", "eh"),  where ``reallocate``(*b*, *c*) reallocates all probability in regex matches for phoneme sequence *b* to corresponding phonemes in phoneme sequence *c*. Blue PPGs are inputs and red PPGs are inferred from synthesized audio. For all other features, green/red is ground truth, black is inferred from reconstructed audio, and red indicates a pitch error of at least 50¢ in voiced regions, a periodicity error of at least 0.1, or a loudness error of at least 6 dB.'}
          width={600}
        />
        <WavTable
          name={NAME}
          experiment={'accent'}
          extension={'mp3'}
          columns={['Input', 'Output']}
          rows={['_']}
        />
      </section>
    </div>
  );
}


function SpectralBalance() {
  let markdown = `## Spectral balance (Sections 3; 5.5)`;
  return (
    <div className='section'>
      <section id='spectral-balance'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        <Image
          name={NAME}
          file={`formant.png`}
          alt={'Harmonic estimation'}
          caption={'Estimation of harmonics using our proposed Viterbi-based estimation method'}
          width={750}
        />
        <WavTable
          name={NAME}
          experiment={'formant-range'}
          extension={'mp3'}
          columns={[
            'r_f = 0.50',
            'r_f = 0.71',
            'Original',
            'r_f = 1.41',
            'r_f = 2.00'
          ]}
          rows={['_']}
        />
        <WavTable
          name={NAME}
          experiment={'formant'}
          extension={'mp3'}
          columns={[
            'Original',
            'Proposed (r_f = 0.71)',
            'Proposed (r_f = 1.41)'
          ]}
          rows={['Female 1', 'Male 1', 'Female 2', 'Male 2']}
          transpose={true}
        />
      </section>
    </div>
  );
}


function Onomatopoeia() {
  let markdown = `## Automatic onomatopoeia\n` +
  `We demonstrate that our proposed representation is not limited to human speech by ` +
  `encoding a recording of a cat meowing in our proposed representation and ` +
  `synthesizing human speech from the representation of cat meows. We compare to ` +
  `prior work by Churchwell et al. that uses only (non-sparsified) PPGs and pitch ` +
  `(without Viterbi decoding).`;
  return (
    <div className='section'>
      <section id='automatic-onomatopoeia'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        <Image
          name={NAME}
          file={`cat.png`}
          alt={'Automatic onomatopoeia'}
          caption={'Our proposed representation applied to a cat meowing and speech synthesized from the representation of cat meows. Blue PPGs are inputs and red PPGs are inferred from synthesized audio. For all other features, green/red is ground truth, black is inferred from reconstructed audio, and red indicates a pitch error of at least 50¢ in voiced regions, a periodicity error of at least 0.1, or a loudness error of at least 6 dB.'}
          width={600}
        />
        <WavTable
          name={NAME}
          experiment={'cat'}
          extension={'mp3'}
          columns={['Source', 'Target', 'Churchwell et al. (2024)', 'Proposed']}
          rows={['_']}
        />
        <Image
          name={NAME}
          file={`dog.png`}
          alt={'Automatic onomatopoeia'}
          caption={'Our proposed representation applied to a dog barking and speech synthesized from the representation of dog barks. Blue PPGs are inputs and red PPGs are inferred from synthesized audio. For all other features, green/red is ground truth, black is inferred from reconstructed audio, and red indicates a pitch error of at least 50¢ in voiced regions, a periodicity error of at least 0.1, or a loudness error of at least 6 dB.'}
          width={600}
        />
        <WavTable
          name={NAME}
          experiment={'dog'}
          extension={'mp3'}
          columns={['Source', 'Target', 'Proposed']}
          rows={['_']}
        />
      </section>
    </div>
  );
}


function Overview() {
  return (
    <div className='section'>
      <section id='overview'>
        <Image
          name={NAME}
          file={`hero.png`}
          alt={'A diagram depicting the system flow of our proposed method for representating, editing, and resynthesizing speech.'}
          width={900}
          caption={'Overview of our proposed method. We extract our interpretable, invertible, and disentangled representation from a speech recording and synthesize speech (including any user edits) using an off-the-shelf neural vocoder.'}
        />
      </section>
    </div>
  )
}


function VolumeTimbre() {
  let markdown = `## Timbral correlates of volume (Sections 3; 5.5)\n` +
  ``;

  return (
    <div className='section'>
      <section id='volume-timbre'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        <Image
          name={NAME}
          file={`loudness-range.png`}
          alt={'Input and output A-weighted loudness curves depicting volume editing accuracy'}
          width={900}
          caption={'Input A-weighted loudness (solid) and A-weighted loudness of reconstructed speech (dashed) for jointly increasing the timbral correlates of volume with volume shifts of ±5 and ±10 dBA. Note that most of the reconstruction error is in silent frames (i.e., the noise floor is not reconstructed).'}
        />
        <WavTable
          name={NAME}
          experiment={'loudness-range'}
          extension={'mp3'}
          columns={['-10 dBA', '-5 dBA', 'Original', '+5 dBA', '+10 dBA']}
          rows={['_']}
        />
        <WavTable
          name={NAME}
          experiment={'loudness'}
          extension={'mp3'}
          columns={[
            'Original',
            'Proposed (-10 dBA)',
            'Proposed (+10 dBA)',
            'No augment (-10 dBA)',
            'No augment (+10 dBA)',
            'r_l = 0.5',
            'r_l = 2.0'
          ]}
          rows={['Female 1', 'Male 1', 'Female 2', 'Male 2']}
          transpose={true}
        />
      </section>
    </div>
  );
}


function PitchShifting() {
  let markdown = `## Pitch-shifting (Section 5.4)`;
  return (
    <div className='section'>
      <section id='pitch-shifting'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        <Image
          name={NAME}
          file={`pitch-range.png`}
          alt={'Input and output pitch contours depicting pitch-shifting accuracy'}
          width={900}
          caption={'Input pitch contours (solid) and pitch of reconstructed speech (dashed) for pitch-shifting up or down by up to one octave (1200¢). Low-frequency reconstruction inaccuracy is primarily due to our model having a strict minimum pitch of 50 Hz. This minimum is straightforward to change.'}
        />
        <WavTable
          name={NAME}
          experiment={'pitch-range'}
          extension={'mp3'}
          columns={[
            '-1200¢',
            '-600¢',
            'Original',
            '+600¢',
            '+1200¢',
          ]}
          rows={['_']}
        />
        <WavTable
          name={NAME}
          experiment={'pitch-shifting'}
          extension={'mp3'}
          columns={[
            'Original',
            'Proposed (-600¢)',
            'Proposed (+600¢)',
            'TD-PSOLA (-600¢)',
            'TD-PSOLA (+600¢)',
            'WORLD (-600¢)',
            'WORLD (+600¢)'
          ]}
          rows={['Female 1', 'Male 1', 'Female 2', 'Male 2']}
          transpose={true}
        />
      </section>
    </div>
  );
}


function Reconstruction() {
  let markdown = `## Reconstruction (Section 5.3)\n` +
  `We compare speech reconstruction using our proposed representation to an ` +
  `equivalent system that uses Mel spectrograms as the input representation ` +
  `to demonstrate that our representation reconstructs speech roughly as ` +
  `well as Mel spectrograms. ` +
  `Examples correspond to the subjective evaluation described in Section 5.3 ` +
  `and Table 1 (Reconstruction). The rest of this website ` +
  `demonstrates edits that are difficult or impossible to perform with a Mel ` +
  `spectrogram.`;
  let elements = ['Female 1', 'Male 1', 'Female 2', 'Male 2'].map((item) =>
    <div className='section'>
      <Image
        name={NAME}
        file={`reconstruction/${item}.png`}
        alt={'Our proposed representation before and after reconstruction'}
        width={600}
        caption={'Our proposed representation before and after reconstruction. Blue PPGs are inputs and red PPGs are inferred from reconstructed audio. For all other features, green/red is ground truth, black is inferred from reconstructed audio, and red indicates a pitch error of at least 50¢ in voiced regions, a periodicity error of at least 0.1, or a loudness error of at least 6 dB.'}
      />
      <WavTable
        name={NAME}
        experiment={'reconstruction'}
        extension={'mp3'}
        columns={['Original', 'Proposed', 'Mels']}
        rows={[item]}
        transpose={true}
      />
    </div>
  );
  return (
    <div className='section'>
      <section id='reconstruction'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        {elements}
      </section>
    </div>
  );
}


function SingingTransfer() {
  let markdown = `## Singing voice transfer`;
  return (
    <div className='section'>
      <section id='singing-transfer'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        <Image
          name={NAME}
          file={`male.png`}
          alt={'Our proposed representation before and after singing voice conversion'}
          width={600}
          caption={'Our proposed representation before and after singing voice conversion. Blue PPGs are inputs and red PPGs are inferred from reconstructed audio. For all other features, green/red is ground truth, black is inferred from reconstructed audio, and red indicates a pitch error of at least 50¢ in voiced regions, a periodicity error of at least 0.1, or a loudness error of at least 6 dB.'}
          />
        <WavTable
          name={NAME}
          experiment={'male-singing'}
          extension={'mp3'}
          columns={['Source', 'Target', 'Proposed']}
          rows={['_']}
        />
        <Image
          name={NAME}
          file={`female.png`}
          alt={'Our proposed representation before and after singing voice conversion'}
          width={600}
          caption={'Our proposed representation before and after singing voice conversion. Blue PPGs are inputs and red PPGs are inferred from reconstructed audio. For all other features, green/red is ground truth, black is inferred from reconstructed audio, and red indicates a pitch error of at least 50¢ in voiced regions, a periodicity error of at least 0.1, or a loudness error of at least 6 dB.'}
        />
        <WavTable
          name={NAME}
          experiment={'female-singing'}
          extension={'mp3'}
          columns={['Source', 'Target', 'Proposed']}
          rows={['_']}
        />
      </section>
    </div>
  );
}


function SpeakerAdaptation() {
  let markdown = `## Speaker adaptation`;
  let elements = ['Female 1', 'Male 1', 'Female 2', 'Male 2'].map((item) =>
    <>
      <Image
        name={NAME}
        file={`adaptation/${item}.png`}
        alt={'Our proposed representation before and after reconstruction on the DAPS dataset'}
        caption={'Our proposed representation before and after reconstruction when adapted to a speaker in the DAPS dataset. Blue PPGs are inputs and red PPGs are inferred from reconstructed audio. For all other features, green/red is ground truth, black is inferred from reconstructed audio, and red indicates a pitch error of at least 50¢ in voiced regions, a periodicity error of at least 0.1, or a loudness error of at least 6 dB.'}
        width={600}
      />
      <WavTable
        name={NAME}
        experiment={'adaptation'}
        extension={'mp3'}
        columns={['Original', 'Proposed']}
        rows={[item]}
        transpose={true}
      />
    </>
  );
  return (
    <div className='section'>
      <section id='adaptation'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        {elements}
      </section>
    </div>
  );
}


function TimeStretching() {
  let markdown = `## Time-stretching (Section 5.4)`;
  let tableDescription = `Objective evaluation of time-stretching was ` +
    `omitted from our paper due to page limits and is provided below.`;
  return (
    <div className='section'>
      <section id='time-stretching'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        <WavTable
          name={NAME}
          experiment={'time-range'}
          extension={'mp3'}
          columns={[
            '0.50x',
            '0.71x',
            'Original',
            '1.41x',
            '2.00x'
          ]}
          rows={['_']}
        />
        <WavTable
          name={NAME}
          experiment={'time-stretching'}
          extension={'mp3'}
          columns={[
            'Original',
            'Proposed (0.71x)',
            'Proposed (1.41x)',
            'TD-PSOLA (0.71x)',
            'TD-PSOLA (1.41x)',
            'WORLD (0.71x)',
            'WORLD (1.41x)'
            ]}
          rows={['Female 1', 'Male 1', 'Female 2', 'Male 2']}
          transpose={true}
        />
        <br/>
        <div style={{'textAlign': 'center'}}>{tableDescription}</div>
        <Table
          data={[
            { 'Method': 'Proposed', '∆¢': '20.4', '∆φ': '.066', '∆dBA': '1.29', '∆PPG': '.195', 'Subjective': '64.0' },
            { 'Method': 'TD-PSOLA', '∆¢': '22.0', '∆φ': '.062', '∆dBA': '1.65', '∆PPG': '.189', 'Subjective': '63.3 ± 1.71' },
            { 'Method': 'WORLD', '∆¢': '18.2', '∆φ': '.103', '∆dBA': '4.48', '∆PPG': '.473', 'Subjective': '46.5 ± 2.20' },
          ]}
        />
      </section>
    </div>
  );
}


function VoiceConversion() {
  let markdown = `## Voice conversion`;
  return (
    <div className='section'>
      <section id='voice-conversion'>
        <ReactMarkdown>{markdown}</ReactMarkdown>
        <WavTable
          name={NAME}
          experiment={'voice-conversion'}
          extension={'mp3'}
          columns={['Source', 'Target', 'Proposed']}
          rows={[
            'Female → Male',
            'Female → Female',
            'Male → Female',
            'Male → Male'
          ]}
          transpose={true}
        />
      </section>
    </div>
  );
}
