StarVC/index.html at main · thuhcsi/StarVC · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
<!DOCTYPE html>
<html lang="en-US">

<head>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  <!-- Begin Jekyll SEO tag v2.7.1 -->
  <title>StarVC: A Unified Auto-Regressive Framework for Joint Text and Speech Generation in Voice Conversion</title>
  <meta name="generator" content="Jekyll v3.9.0">
  <!-- Open Graph title: kept identical to the page title above. -->
  <meta property="og:title" content="StarVC: A Unified Auto-Regressive Framework for Joint Text and Speech Generation in Voice Conversion">
  <meta property="og:locale" content="en_US">
  <meta name="twitter:card" content="summary">
  <!-- End Jekyll SEO tag -->

  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="theme-color" content="#157878">
  <link rel="stylesheet" href="style.css">

  <style>
    /* Data tables: centred on the page at 80% width (adjust as needed),
       with adjacent cell borders collapsed. */
    table.center-table {
      width: 80%;
      margin: 0 auto;
      border-collapse: collapse;
    }

    /* Every header and data cell: centred text with 5px of padding.
       No cell border is declared in this block. */
    table.center-table td,
    table.center-table th {
      text-align: center;
      padding: 5px;
    }
  </style>
</head>

<body>
<!-- Page banner section: both template headings below are commented out, so it contains no text. -->
<section class="page-header">
  <!-- <h1 class="project-name">Demo PAGE</h1> -->
  <!-- <h2 class="project-tagline"></h2> -->
</section>

<section class="main-content">
  <!-- Page title, centred with CSS. -->
  <h1 style="text-align: center;">
    StarVC: A Unified Auto-Regressive Framework for Joint Text and Speech Generation in Voice Conversion
  </h1>

  <br><br>
  <h2 id="abstract">1. Abstract</h2>
  <p>
    Voice Conversion (VC) modifies speech to match a target speaker while preserving linguistic content.
    Traditional methods usually extract speaker information directly from speech while neglecting the explicit utilization of linguistic content. Since VC fundamentally involves disentangling speaker identity from linguistic content, leveraging structured semantic features could enhance conversion performance. However, previous attempts to incorporate semantic features into VC have shown limited effectiveness, motivating the integration of explicit text modeling.
    We propose <strong>StarVC</strong>, a unified autoregressive VC framework that first predicts text tokens before synthesizing acoustic features. The experiments demonstrate that StarVC outperforms conventional VC methods in preserving both linguistic content (i.e., WER) and speaker characteristics (i.e., SECS and MOS).
  </p>

  <br>
  <!-- Centred figure image. -->
  <!-- TODO(review): replace the alt text with a description of what the figure shows. -->
  <figure style="margin: 0; text-align: center;">
    <img src="raw/fig/fig_1.png" alt="Figure 1: StarVC">
  </figure>
  <br><br>

  <h2>2. Computational Metrics</h2>
  <h3>Table 1: Objective Evaluation of StarVC and Baselines (including Ablations)</h3>
  <p>
    SECS-Res and SECS-Wavlm are SECS metrics from Resemblyzer and a fine-tuned WavLM, respectively.
    <br>
    WER-Text and CER-Text evaluate the text generated by StarVC at word-level and character-level accuracy.
    <br>
    <strong>Bold</strong> values indicate the best results, and <u>underlined</u> values indicate the second-best results.
  </p>
  <!-- Objective metrics table; the "center-table" class centres the table and the text of every cell. -->
  <table class="center-table">
    <thead>
    <tr>
      <th scope="col">Model</th>
      <th scope="col">SECS-Res &uarr;</th>
      <th scope="col">SECS-Wavlm &uarr;</th>
      <th scope="col">WER &darr;</th>
      <th scope="col">WER-Text &darr;</th>
      <th scope="col">CER &darr;</th>
      <th scope="col">CER-Text &darr;</th>
    </tr>
    </thead>
    <tbody>
    <!-- First group: baseline systems (rows before the \midrule of the corresponding LaTeX table) -->
    <tr>
      <td>CosyVoice</td>
      <td><strong>0.839</strong></td>
      <td><strong>0.478</strong></td>
      <td>8.24%</td>
      <td>/</td>
      <td>4.27%</td>
      <td>/</td>
    </tr>
    <tr>
      <td>OpenVoice V2</td>
      <td>0.771</td>
      <td>0.284</td>
      <td>8.17%</td>
      <td>/</td>
      <td><u>4.15%</u></td>
      <td>/</td>
    </tr>
    <tr>
      <td>TriAAN-VC</td>
      <td>0.756</td>
      <td>0.241</td>
      <td>19.67%</td>
      <td>/</td>
      <td>12.18%</td>
      <td>/</td>
    </tr>
    <!-- Second group: StarVC and its ablations (rows after the \midrule of the corresponding LaTeX table) -->
    <tr>
      <td><strong>StarVC</strong></td>
      <td><u>0.835</u></td>
      <td><u>0.472</u></td>
      <td><strong>6.27%</strong></td>
      <td><strong>4.95%</strong></td>
      <td><strong>4.09%</strong></td>
      <td><strong>1.51%</strong></td>
    </tr>
    <tr>
      <td>-- w/o multi-stage</td>
      <td>0.812</td>
      <td>0.429</td>
      <td><u>7.24%</u></td>
      <td><u>5.09%</u></td>
      <td>4.60%</td>
      <td>1.61%</td>
    </tr>
    <tr>
      <td>-- w/o text token</td>
      <td>0.771</td>
      <td>0.382</td>
      <td>7.30%</td>
      <td>/</td>
      <td>4.31%</td>
      <td>/</td>
    </tr>
    <tr>
      <td>-- smaller model</td>
      <td>0.750</td>
      <td>0.383</td>
      <td>8.04%</td>
      <td>5.33%</td>
      <td>5.67%</td>
      <td><u>1.56%</u></td>
    </tr>
    </tbody>
  </table>

  <!-- Sub-heading introducing the second table -->
  <h3>Table 2: MOS Evaluation of StarVC and Baselines with 95% confidence interval</h3>
  <p>
    <strong>Bold</strong> values indicate the best results, and <u>underlined</u> values indicate the second-best results.
  </p>

  <!-- MOS table: one row per system, SMOS and NMOS columns -->
  <table class="center-table">
    <thead>
    <tr>
      <th scope="col">Model</th>
      <th scope="col">SMOS &uarr;</th>
      <th scope="col">NMOS &uarr;</th>
    </tr>
    </thead>
    <tbody>
    <tr>
      <td>CosyVoice</td>
      <td>3.94 ± 0.09</td>
      <td><u>4.15 ± 0.08</u></td>
    </tr>
    <tr>
      <td>OpenVoice V2</td>
      <td><u>3.97 ± 0.08</u></td>
      <td>4.09 ± 0.08</td>
    </tr>
    <tr>
      <td>TriAAN-VC</td>
      <td>3.25 ± 0.09</td>
      <td>3.06 ± 0.10</td>
    </tr>
    <tr>
      <td><strong>StarVC</strong></td>
      <td><strong>3.98 ± 0.08</strong></td>
      <td><strong>4.17 ± 0.08</strong></td>
    </tr>
    </tbody>
  </table>


  <h2>3. Demo<a name="Comparison"></a></h2>
  <ul>
    <li>TriAAN-VC: A deep-learning-based framework for any-to-any VC, focusing on disentangling linguistic content and target speaker characteristics.[1]</li>
    <li>OpenVoice V2: A large-scale, zero-shot VC model built upon YourTTS.[2]</li>
    <li>CosyVoice: A diffusion-based speech generation approach from Alibaba, featuring a VC variant.[3]</li>
  </ul>

  <!-- Audio comparison table: the rows of <tbody id="tbody"> are filled in by the script on this page -->
  <table class="center-table" style="border-collapse: collapse; width: 100%;">
    <tbody id="tbody"></tbody>
  </table>

  <br>
  <cite>[1] H. J. Park, S. W. Yang, J. S. Kim, W. Shin, and S. W. Han, “Triaan-vc: Triple adaptive attention normalization for any-to-any voice conversion,” in ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2023, pp. 1–5.</cite>
  <br>
  <cite>[2] Z. Qin, W. Zhao, X. Yu, and X. Sun, “Openvoice: Versatile instant voice cloning,” arXiv preprint arXiv:2312.01479, 2023.</cite>
  <br>
  <cite>[3] Z. Du, Q. Chen, S. Zhang, K. Hu, H. Lu, Y. Yang, H. Hu, S. Zheng, Y. Gu, Z. Ma et al., “Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens,” arXiv preprint arXiv:2407.05407, 2024.</cite>
</section>
</body>

<script>
  /*
   * Builds the audio comparison table and inserts it into <tbody id="tbody">.
   *
   * Layout: one header row, then 8 "Same Gender" rows followed by
   * 7 "Cross Gender" rows. Every row holds six audio players, in column order:
   * Source Speech, Target Speaker, TriAAN-vc, OpenVoice V2, CosyVoice, StarVC.
   * Audio files are referenced as raw/samples/<group>/<system>/<n>.wav, where
   * <n> starts at 1 inside each group.
   *
   * Runs on DOMContentLoaded: only the parsed DOM is needed, so there is no
   * reason to wait for images, and a listener does not replace other handlers.
   */
  document.addEventListener('DOMContentLoaded', function () {
    // One entry per audio column, in display order.
    // `dir` is the sample sub-folder, `title` the column heading.
    const systems = [
      { dir: 'source', title: 'Source Speech' },
      { dir: 'target', title: 'Target Speaker' },
      { dir: 'triaan_vc', title: 'TriAAN-vc' },
      { dir: 'openvoice', title: 'OpenVoice V2' },
      { dir: 'cosyvoice', title: 'CosyVoice' },
      { dir: 'starvc', title: 'StarVC' },
    ];

    // One entry per conversion type, in display order.
    // `rows` is the number of samples (files 1.wav .. <rows>.wav per system).
    const groups = [
      { dir: 'same_gender', label: 'Same Gender', rows: 8 },
      { dir: 'diff_gender', label: 'Cross Gender', rows: 7 },
    ];

    // Header cell for one column.
    const headerCell = (title) =>
      `<th scope="col" style="width: 150px;">${title}</th>`;

    // Cell with the audio player for sample `index` of one system in one group.
    const audioCell = (groupDir, systemDir, index) =>
      `<td><audio style="width: 100px;" controls src="raw/samples/${groupDir}/${systemDir}/${index}.wav"></audio></td>`;

    // All rows of one group. Only the first row carries the group label,
    // in a cell that spans every row of the group.
    const groupRows = (group) => {
      const rows = [];
      for (let index = 1; index <= group.rows; index++) {
        const labelCell = index === 1
          ? `<td rowspan="${group.rows}" style="font-weight: bold; ">${group.label}</td>`
          : '';
        const audioCells = systems
          .map((system) => audioCell(group.dir, system.dir, index))
          .join('');
        rows.push(`<tr>${labelCell}${audioCells}</tr>`);
      }
      return rows.join('');
    };

    const target = document.getElementById('tbody');
    if (!target) {
      // Nothing to fill if the placeholder table is not on the page.
      return;
    }

    // 7 columns in total: "Conversion Type" plus the six audio columns.
    const headerRow =
      `<tr>${headerCell('Conversion Type')}${systems.map((system) => headerCell(system.title)).join('')}</tr>`;

    target.innerHTML = headerRow + groups.map(groupRows).join('');
  });
</script>

</html>