syllabify : prise en compte langues sans espace insécable avant les
[nenuvar.git] / scripts / syllabify.py
blob28181d072683fd5aed931e0385299b56cb8b51bd
1 # -*- coding: utf-8 -*-
2 import re, sys, argparse
class Sign():
    """
    One syllable constituent together with the text that travels with it.

    The constituent is the single character given to the constructor; the
    attached text is whatever has to be printed for that constituent (the
    letter as written, mute characters, punctuation, spaces, etc.).

    sign.get_char() gives the syllable constituent.
    sign.get_text() gives the whole text attached to the sign.
    """
    def __init__(self, c):
        # The constituent character, stored exactly as received.
        self._sign = c
        # Printable text, accumulated through add_text().
        self._text = ""
        # Word boundary flags.
        self._word_start = self._word_end = False
        # Forced syllable boundary flags.
        self._forced_syllable_start = self._forced_syllable_end = False

    # --- constituent and text -------------------------------------------

    def get_char(self):
        """Return the syllable constituent character."""
        return self._sign

    def get_text(self):
        """Return all the text attached to this sign so far."""
        return self._text

    def add_text(self, str):
        """Append `str` to the text attached to this sign."""
        self._text += str

    # --- word boundaries --------------------------------------------------

    def set_word_start(self):
        """Flag this sign as the first sign of a word."""
        self._word_start = True

    def word_start(self):
        """Tell whether this sign has been flagged as a word start."""
        return self._word_start

    def set_word_end(self):
        """Flag this sign as the last sign of a word."""
        self._word_end = True

    def word_end(self):
        """Tell whether this sign has been flagged as a word end."""
        return self._word_end

    # --- forced syllable boundaries ---------------------------------------

    def set_forced_syllable_start(self):
        """Flag this sign as necessarily starting a syllable."""
        self._forced_syllable_start = True

    def forced_syllable_start(self):
        """Tell whether a syllable must start at this sign."""
        return self._forced_syllable_start

    def set_forced_syllable_end(self):
        """Flag this sign as necessarily ending a syllable."""
        self._forced_syllable_end = True

    def forced_syllable_end(self):
        """Tell whether a syllable must end at this sign."""
        return self._forced_syllable_end
class SignTokenizer():
    """
    Provides a method for building a list of signs from a decorated verse
    string.

    Usage:
      sign_tokenizer = SignTokenizer()
      signs = sign_tokenizer.tokenize("Un ver avec des décorations")
    signs being a list of Sign objects

    The decorations can be:
    - "°" for grouping 'empty' words to 'full' words.
      Example:
        En°vain j'ay respecté la°celebre memoire
        Des°Heros des°siecles passez ;
      Can be overriden with word_separator_markers constructor keyword

    - "*" for marking a mute letter (e.g. a 'h').
      Example:
        Et c'est l'*Hyver qui les°rassemble.
      Can be overriden with mute_character_marker constructor keyword

    - "=" for forcing syllable ends, e.g. for marking a diaeresis.
      Example:
        Trop *heureux Phrygi=ens, venez icy l'attendre.
      Can be overriden with forced_syllable_end_marker constructor keyword

    - other unused markers: < > { }
      Can be overriden with ignored_markers constructor keyword

    Text met before the first sign of the verse (an opening bracket, a
    leading apostrophe or punctuation, ...) is kept and attached to the
    first sign.  A mute character is attached to the sign that follows it;
    when no sign follows, it is not attached to anything.
    """
    def __init__(self,
                 language = "fr",
                 word_separators = " -",
                 word_separator_markers = "°",
                 simple_punctuations = ".,",
                 double_punctuations = ":;?!",
                 apostrophes = "'’",
                 forced_syllable_end_marker = "=",
                 mute_character_marker = "*",
                 ignored_markers = "<>{}",
                 ignored_characters = "[]()|/~_"
                 ):
        """
        Every keyword but `language` is a string listing the characters
        playing the corresponding role.  When `language` is "fr", a
        no-break space is inserted before double punctuations.
        """
        self.word_separators = word_separators
        self.word_separator_markers = word_separator_markers
        self.all_word_separators = "".join((word_separators,
                                            word_separator_markers))
        self.simple_punctuations = simple_punctuations
        self.double_punctuations = double_punctuations
        self.space_before_double_punctuations = (language == "fr")
        self.apostrophes = apostrophes
        self.forced_syllable_end_marker = forced_syllable_end_marker
        self.mute_character_marker = mute_character_marker
        self.ignored_markers = ignored_markers
        self.ignored_characters = ignored_characters
        # The configured characters are escaped before being put in a
        # pattern, so that "]", "^", "|", "." etc. are taken literally.
        punctuations = "".join((self.simple_punctuations,
                                self.double_punctuations))
        if punctuations:
            # optional spaces, then one punctuation sign
            punctuation_pattern = " *([{}])".format(re.escape(punctuations))
        else:
            # no punctuation configured: a pattern which never matches
            punctuation_pattern = "(?!)"
        self.punctuation_re = re.compile(punctuation_pattern)
        # the word "et" followed by a word separator
        self.et_re = re.compile("([Ee]t)({})".format(
            "|".join([re.escape(separator)
                      for separator in self.all_word_separators])))
        # so that get_chars() and get_full_verse() can be called at any time
        self._reset()

    def _reset(self):
        """Forget the result of any previous tokenization."""
        self._prefix = ""
        self._current_sign = None
        self._signs = []

    def _add_sign(self, c):
        """Start a new sign for character `c`, giving it the pending prefix."""
        self._current_sign = Sign(c.lower())
        self._signs.append(self._current_sign)
        if self._prefix != "":
            self._current_sign.add_text(self._prefix)
            self._prefix = ""

    def _add_prefix(self, prefix):
        """Keep `prefix` aside: it will be attached to the next sign."""
        self._prefix = "".join((self._prefix, prefix))

    def _add_text(self, text):
        """
        Attach `text` to the current sign, or keep it for the first sign
        when no sign has been met yet.
        """
        if self._current_sign is None:
            self._add_prefix(text)
        else:
            self._current_sign.add_text(text)

    def _set_forced_syllable_end(self):
        """Mark the current sign (if any) as a forced syllable end."""
        if self._current_sign is not None:
            self._current_sign.set_forced_syllable_end()

    def _set_word_end(self):
        """Mark the current sign (if any) as a word end."""
        if self._current_sign is not None:
            self._current_sign.set_word_end()

    def tokenize(self, verse_text):
        """
        Split the decorated string `verse_text` into signs.

        Returns the list of Sign objects (empty when the verse holds no
        consonant or vowel).  The same list is kept on the tokenizer for
        get_chars() and get_full_verse().
        """
        self._reset()
        sign_count = len(verse_text)
        i = 0
        mute_next = False
        word_start = True
        while (i < sign_count):
            c = verse_text[i]
            # matching from position i avoids copying the verse tail
            punctuation_match = self.punctuation_re.match(verse_text, i)
            ## Markers: they are not real text
            # forced syllable end marker
            if c == self.forced_syllable_end_marker:
                self._set_forced_syllable_end()
                i += 1
            # mute character marker
            elif c == self.mute_character_marker:
                i += 1
                mute_next = True
            # ignored markers
            elif c in self.ignored_markers:
                i += 1
            ## Actual text
            # apostroph: always printed as a typographic apostrophe
            elif c in self.apostrophes:
                self._add_text("’")
                i += 1
            # punctuation (tested before word separators, so that the
            # spaces written before a punctuation sign are swallowed)
            elif punctuation_match:
                punct = punctuation_match.group(1)
                if (self.space_before_double_punctuations
                    and punct in self.double_punctuations):
                    self._add_text("\u00A0")
                self._add_text(punct)
                i = punctuation_match.end()
                self._set_word_end()
                word_start = True
            # word separator
            elif c in self.all_word_separators:
                self._set_word_end()
                word_start = True
                if c in self.word_separator_markers:
                    # a marker is printed as a plain space
                    self._add_text(" ")
                else:
                    self._add_text(c)
                i += 1
            # ignored characters: printed, but not syllable constituents
            elif c in self.ignored_characters:
                self._add_text(c)
                i += 1
            # mute character: printed with the next sign
            elif mute_next:
                self._add_prefix(c)
                mute_next = False
                i += 1
            # consonant or vowel
            else:
                m = word_start and self.et_re.match(verse_text, i)
                if m:
                    # special case: et -> &
                    self._add_sign("&")
                    self._add_text(m.group(1))
                    self._add_text(" ")
                    self._set_word_end()
                    word_start = True
                    i = m.end()
                else:
                    self._add_sign(c)
                    self._add_text(c)
                    word_start = False
                    i += 1
        # the last character is at word end and syllable end
        self._set_word_end()
        self._set_forced_syllable_end()
        # set word_start and forced_syllable_start for characters
        # following a word end or forced_syllable_end
        at_word_start = True
        at_syllable_start = True
        for sign in self._signs:
            if at_word_start:
                sign.set_word_start()
            if at_syllable_start:
                sign.set_forced_syllable_start()
            at_word_start = sign.word_end()
            at_syllable_start = sign.forced_syllable_end()
        return self._signs

    def get_chars(self):
        """Return the constituent characters of the last tokenized verse."""
        return "".join([c.get_char() for c in self._signs])

    def get_full_verse(self):
        """Return the printable text of the last tokenized verse."""
        return "".join([c.get_text() for c in self._signs])
class Syllable():
    """
    Represents a syllable, consisting in a list of signs.
    """
    def __init__(self):
        self._signs = []

    def add_sign(self, sign):
        """Append one sign to the syllable."""
        self._signs.append(sign)

    def add_signs(self, signs):
        """Append several signs to the syllable."""
        self._signs.extend(signs)

    def get_signs(self):
        """Return the list of signs (the list itself, not a copy)."""
        return self._signs

    def set_signs(self, signs):
        """Replace the signs of the syllable."""
        self._signs = signs

    def get_text(self):
        """Return the printable text of the syllable."""
        return "".join([sign.get_text() for sign in self._signs])

    def get_chars(self):
        """Return the constituent characters of the syllable."""
        return "".join([sign.get_char() for sign in self._signs])

    def is_empty(self):
        """Tell whether the syllable holds no sign."""
        return not self._signs

    def at_word_start(self):
        """
        Tell whether the first sign starts a word (False when the
        syllable is empty).
        """
        return bool(self._signs) and self._signs[0].word_start()

    def at_word_end(self):
        """
        Tell whether the last sign ends a word (False when the syllable
        is empty).
        """
        return bool(self._signs) and self._signs[-1].word_end()

    def is_feminine(self):
        """
        A syllable is feminine iff:
        - it is placed at word end
        - it contains exactly one vowel, which is 'e' or 'ë', at the end
          (with possibly a final s), or it ends with "que", "gue",
          "ques" or "gues"
        - it is not one of the words ces, mes, ses, tes, les, des, es

        An empty syllable is not feminine.
        """
        if not self.at_word_end():
            return False
        chars = self.get_chars()
        # special cases:
        # exact words: ces, mes, ses, tes, les, des, es
        # have no feminine e
        if self.at_word_start() and re.match("^[cmstld]?es$", chars):
            return False
        vowels = "".join([char for char in chars
                          if char in "aàâäeëéèêœiìïîoôòuùûüy&"])
        # only one vowel: e or ë, and word ends with -e or -es
        if vowels in ("e", "ë"):
            return chars.endswith(vowels) or chars.endswith(vowels + "s")
        # two vowels: "que", "gue", "ques" or "gues".  The pattern is
        # anchored at the end of the syllable, otherwise syllables like
        # "quer", "quet" or "quel" would be taken for feminine ones.
        if vowels in ("ue", "uë"):
            return re.search("[qg]u[eë]s?$", chars) is not None
        return False
class SyllableTokenizer():
    """
    Provides a method for building a list of syllables from a list of signs.

    Usage:
      sign_tokenizer = SignTokenizer()
      syllable_tokenizer = SyllableTokenizer()
      signs = sign_tokenizer.tokenize("Un ver avec des décorations")
      syllables = syllable_tokenizer.tokenize(signs)
    syllables being a list of Syllable objects
    """
    def __init__(self,
                 e_vowels = "eë",
                 other_vowels = "aàâäéèêœiìïîoôòuùûüy&",
                 consonants_sonority_levels = None
                 ):
        """
        e_vowels: the vowels which can make a feminine syllable.
        other_vowels: all the other vowels.
        consonants_sonority_levels: a dictionary with keys 'liquid',
        'nasal', 'constrictive' and 'occlusive', each one giving a string
        of consonants.  When omitted, French defaults are used.
        """
        if consonants_sonority_levels is None:
            # built at each call, so that instances never share the dict
            consonants_sonority_levels = { 'liquid' : "lrh",
                                           'nasal' : "mn",
                                           'constrictive' : "çfjsvxz",
                                           'occlusive' : "bcdgkpqt" }
        self.e_vowels = e_vowels
        self.other_vowels = other_vowels
        self.vowels = "".join((e_vowels, other_vowels))
        self.consonants_sonority_levels = consonants_sonority_levels
        self.consonants = "".join(consonants_sonority_levels.values())
        self._reset()
        self.re = {
            # [something][vowel (no feminine e)]<space>[vowel]
            'hiatus' : ".[{}][{}]".format(self.other_vowels, self.vowels),
            # <word start>s[cçpt][vowel]
            '^sca' : "s[cçpt][{}]".format(self.vowels),
            # <word start>s[cp][lr][vowel]
            '^scla' : "s[cp][lr][{}]".format(self.vowels),
            # <word start>ps[vowel]
            '^psa' : "ps[{}]".format(self.vowels),
            # gn[vowel]
            'gna' : "gn[{}]".format(self.vowels),
            # [occlusive bcdgkpqt or constrictive çfjvxz][liquid lrh][vowel]
            'bla' : "[{}{}][{}][{}]".format(
                self.consonants_sonority_levels['occlusive'],
                self.consonants_sonority_levels['constrictive'].replace("s", ""),
                self.consonants_sonority_levels['liquid'],
                self.vowels),
            # [tpc]h[rl][vowel]
            'thra' : "[tpc]h[rl][{}]".format(self.vowels),
            # [consonant][vowel]
            'ba' : "[{}][{}]".format(self.consonants, self.vowels),
            }
        self.compiled_re = {}
        for (key, string) in self.re.items():
            self.compiled_re[key] = re.compile(string)
        self._match_data = None

    def _match(self, re_key, text, pos = 0):
        """
        Match the pattern registered as `re_key` against `text`, starting
        at index `pos`.  The result (a match object or None) is returned
        and remembered for _get_match_data().
        """
        self._match_data = self.compiled_re[re_key].match(text, pos)
        return self._match_data

    def _get_match_data(self):
        """Return the result of the last call to _match()."""
        return self._match_data

    def _reset(self):
        """Forget the result of any previous tokenization."""
        self._syllables = []
        self._current_syllable = None
        # Signs met before the first syllable break are gathered here.
        self._first_syllable = Syllable()

    def _start_new_syllable(self):
        """
        Close the syllable being built and open a new one (unless the
        current one is still empty, in which case it is reused).
        """
        if (self._first_syllable and not self._first_syllable.is_empty()):
            self._syllables.append(self._first_syllable)
        if not (self._current_syllable
                and self._current_syllable.is_empty()):
            self._current_syllable = Syllable()
            self._syllables.append(self._current_syllable)
        self._first_syllable = None

    def _add_sign(self, text):
        """Add a sign to the syllable being built."""
        if self._first_syllable:
            self._first_syllable.add_sign(text)
        else:
            self._current_syllable.add_sign(text)

    def _finish(self):
        """
        Complete the syllable list once all the signs have been read:
        the first syllable is stored if no syllable break ever did it
        (single syllable verse), and empty syllables are dropped.
        """
        if (self._first_syllable and not self._first_syllable.is_empty()):
            self._syllables.append(self._first_syllable)
        self._first_syllable = None
        self._syllables = [syllable for syllable in self._syllables
                           if not syllable.is_empty()]

    def get_syllables(self):
        """Return the syllables built by the last call to tokenize()."""
        return self._syllables

    def tokenize(self, signs):
        """
        Group `signs` (a list of Sign objects) into syllables.

        Returns a list of Syllable objects, none of them being empty; the
        list is empty when `signs` is empty.
        """
        self._reset()
        # verse_text[i] is read in parallel with signs[i]: this assumes
        # that get_char() gives exactly one character per sign.
        verse_text = "".join([sign.get_char() for sign in signs])
        sign_count = len(signs)
        i = 0
        while (i < sign_count):
            word_start = signs[i].word_start()
            # forced syllable ends
            if (i > 0 and signs[i].forced_syllable_start()):
                self._start_new_syllable()

            # Hiatus
            # ^[vowel]<space>
            if (i == 0
                and verse_text[i] in self.vowels
                and signs[i].word_end()):
                self._add_sign(signs[i])
                i += 1
                self._start_new_syllable()
            # [something][vowel (no feminine e)]<space>[vowel]
            elif (self._match('hiatus', verse_text, i)
                  and signs[i+1].word_end()):
                self._add_sign(signs[i])
                self._add_sign(signs[i+1])
                self._start_new_syllable()
                self._add_sign(signs[i+2])
                i += 3
            # Syllable onsets.  The order of the tests matters: the
            # pattern which matched last is the one read back below.
            elif (
                # <word start>s[cçpt][vowel]
                (word_start and self._match('^sca', verse_text, i)
                 and not signs[i].word_end())
                # <word start>s[cp][lr][vowel]
                or (word_start and self._match('^scla', verse_text, i)
                    and not signs[i].word_end()
                    and not signs[i+1].word_end())
                # <word start>ps[vowel]
                or (word_start and self._match('^psa', verse_text, i))
                # gn[vowel]
                or (self._match('gna', verse_text, i)
                    and not signs[i].word_end())
                # [bcdgkpqtçfjvxz][lrh][vowel]
                or (self._match('bla', verse_text, i)
                    and not signs[i].word_end())
                # [tpc]h[rl][vowel]
                or (self._match('thra', verse_text, i)
                    and not signs[i+1].word_end())
                # [consonant][vowel]
                or self._match('ba', verse_text, i)
                ):
                match = self._get_match_data().group(0)
                self._start_new_syllable()
                # all the matched signs belong to the new syllable
                for sign in signs[i:i + len(match)]:
                    self._add_sign(sign)
                i += len(match)
            else:
                self._add_sign(signs[i])
                i += 1
        self._finish()
        return self.get_syllables()
class SyllableTokenizerWithWordSeparation(SyllableTokenizer):
    """
    A specialized SyllableTokenizer which prefers syllable
    breaking between words when possible. For instance:

      "tant attendu"
      gives:      tant / at / ten / du
      instead of: tan / t at / ten / du

    This is useful when breaking verses for lyrics.

    Usage:
      sign_tokenizer = SignTokenizer()
      syllable_tokenizer = SyllableTokenizerWithWordSeparation()
      signs = sign_tokenizer.tokenize("Un ver avec des décorations")
      syllables = syllable_tokenizer.tokenize(signs)
    syllables being a list of Syllable objects
    """
    def force_word_separation(self, syllables = None):
        """
        Move syllable breaks onto word boundaries.

        When a syllable starts inside a word and holds a vowel which
        starts another word, all the signs placed before that vowel are
        moved to the end of the previous syllable.

        `syllables` is the list to process (the syllables of the last
        tokenization when omitted).  The Syllable objects are modified in
        place and the same list is returned.
        """
        if syllables is None:
            syllables = self._syllables
        prev_syllable = None
        for this_syllable in syllables:
            signs = this_syllable.get_signs()
            # The first syllable has no previous syllable to give signs
            # to; a syllable of less than two signs cannot be split.
            if (prev_syllable is not None
                and len(signs) > 1
                and not signs[0].word_start()):
                split_index = self._word_start_vowel_index(signs)
                if split_index is not None:
                    # signs from indices 0 to split_index - 1 go to the
                    # previous syllable
                    prev_syllable.add_signs(signs[0:split_index])
                    this_syllable.set_signs(signs[split_index:])
            prev_syllable = this_syllable
        return syllables

    def _word_start_vowel_index(self, signs):
        """
        Return the index (1 or more) of the first sign of `signs` which
        is a vowel at word start, or None when there is none.
        """
        for index in range(1, len(signs)):
            sign = signs[index]
            if sign.word_start() and sign.get_char() in self.vowels:
                return index
        return None

    def tokenize(self, signs):
        """
        Group `signs` into syllables, then move the syllable breaks onto
        word boundaries where possible.  Returns a list of Syllable
        objects.
        """
        SyllableTokenizer.tokenize(self, signs)
        return self.force_word_separation()
class Verse():
    """
    A verse

    Usage:
      verse = Verse("Un ver avec des décorations")
      # sign and syllable tokenizers can possibly be given to syllabify:
      verse.syllabify()
      verse.get_syllables()
      => ["Un ", "ve", "r a", "vec ", "des ", "dé", "co", "ra", "tions"]
    """

    def __init__(self, text, lineno = None):
        """
        text: the decorated verse string.
        lineno: the line number of the verse in its corpus file, if any.
        """
        self._text = text
        self._syllables = []
        self._lineno = lineno

    def get_syllables(self):
        """Return the text of each syllable, as a list of strings."""
        return [syll.get_text() for syll in self._syllables]

    def get_text(self):
        """
        Return the verse text, rebuilt from its syllables (an empty
        string as long as syllabify() has not been called).
        """
        return "".join([syll.get_text() for syll in self._syllables])

    def syllabify(self,
                  sign_tokenizer = None,
                  syllable_tokenizer = None
                  ):
        """
        Split the verse into syllables.

        sign_tokenizer: the object turning the verse text into signs
        (a new SignTokenizer when omitted).
        syllable_tokenizer: the object turning signs into syllables
        (a new SyllableTokenizer when omitted).
        """
        # Tokenizers keep the state of their last run: default ones are
        # created at each call rather than shared through the signature.
        if sign_tokenizer is None:
            sign_tokenizer = SignTokenizer()
        if syllable_tokenizer is None:
            syllable_tokenizer = SyllableTokenizer()
        self._syllables = syllable_tokenizer.tokenize(
            sign_tokenizer.tokenize(self._text))

    def get_metric(self):
        """
        Return the number of syllables of the verse, a final feminine
        syllable not being counted.  The metric is 0 when the verse has
        no syllable (empty verse, or syllabify() not called yet).
        """
        if not self._syllables:
            return 0
        return len(self._syllables) - (1 if self._syllables[-1].is_feminine() else 0)

    def hyphenate(self, hyphen = "-", add_space = False):
        """
        Return the verse text, with `hyphen` inserted between the
        syllables of a same word.

        When `add_space` is true, a space is added after each word which
        is not already followed by one, except at verse end.
        """
        pieces = []
        last_index = len(self._syllables) - 1
        for (i, syllable) in enumerate(self._syllables):
            if (i > 0) and not syllable.at_word_start():
                pieces.append(hyphen)
            text = syllable.get_text()
            pieces.append(text)
            # if syllable is word end and do not end with a space,
            # add it (unless at verse end)
            if (add_space
                and i != last_index
                and syllable.at_word_end()
                and not text.endswith(" ")):
                pieces.append(" ")
        return "".join(pieces)
class Corpus():
    """
    A corpus, consisting of verses.

    Example:
      To generate LilyPond lyrics (where syllables in a word are separated
      with " -- ")

      corpus = Corpus()
      corpus.add_verse("premier ver")
      corpus.add_verse("second ver...")
      corpus.syllabify(syllable_tokenizer = SyllableTokenizerWithWordSeparation())
      corpus.get_hyphenated_verses(hyphen = " -- ")
      => ["pre -- mier ver", "se -- cond ver..."]
    """
    def __init__(self, filename = None):
        """
        filename: the name of the file the corpus comes from, if any.
        """
        self._verses = []
        self._filename = filename

    def add_verse(self, verse, lineno = None):
        """
        Add verse (a string) to the corpus.
        lineno is the line number of the verse in the corpus file, if any.
        """
        self._verses.append(Verse(verse, lineno))

    def get_verses(self):
        """Return the verses of the corpus (a list of Verse objects)."""
        return self._verses

    def syllabify(self,
                  sign_tokenizer = None,
                  syllable_tokenizer = None):
        """
        Syllabify all the corpus verses.

        sign_tokenizer: the object turning a verse text into signs
        (a new SignTokenizer when omitted).
        syllable_tokenizer: the object turning signs into syllables
        (a new SyllableTokenizer when omitted).
        The same two tokenizers are used for all the verses.
        """
        # Tokenizers keep the state of their last run: default ones are
        # created at each call rather than shared through the signature.
        if sign_tokenizer is None:
            sign_tokenizer = SignTokenizer()
        if syllable_tokenizer is None:
            syllable_tokenizer = SyllableTokenizer()
        for verse in self._verses:
            verse.syllabify(sign_tokenizer, syllable_tokenizer)

    def get_hyphenated_verses(self, hyphen = "-", add_space = False):
        """
        Return the hyphenated verses (list of strings) contained in the
        corpus.
        Corpus.syllabify() is supposed to have been called before.
        """
        return [verse.hyphenate(hyphen, add_space)
                for verse in self._verses]
class CorpusReader():
    """
    Reads corpus files, producing Corpus objects.
    """

    def read(self, filename = "-"):
        """
        Read a corpus file (or stdin if filename is "-")
        and produce a Corpus object.

        Files are read as UTF-8 text and closed in all cases; the
        standard input is left open.
        """
        corpus = Corpus(filename)
        if filename == "-":
            self._read_lines(sys.stdin, corpus)
        else:
            with open(filename, 'r', encoding = "utf-8") as file:
                self._read_lines(file, corpus)
        return corpus

    def _read_lines(self, lines, corpus):
        """
        Add to `corpus` the verses found in `lines` (an iterable of
        strings), with their line numbers, counted from 1.
        """
        for (lineno, line) in enumerate(lines, 1):
            line = line.strip()
            # skip empty lines
            if line == "":
                continue
            # skip comments
            # TODO: do something
            if line.startswith("//"):
                continue
            # TODO: titling directives
            if line.startswith("#"):
                continue
            # a verse
            # verse format:
            #   verse text TAB+ [properties]
            # where properties can be:
            #  [LB]+     breve/long syllables indicators
            #  [AT]+     schema (?)
            #  R         "refrain"
            #  D         "double"
            #  other lilypond code
            # for now, we only keep the verse text itself,
            # i.e. what is written before the first TAB
            text = line.partition("\t")[0]
            corpus.add_verse(text, lineno)
def main(argv = None):
    """
    Syllabify and print verses.

    argv: the list of command line arguments (the arguments of the
    running program when omitted).

    The verses come either from a corpus file (--corpus) or from the
    command line (--verse).  Each verse is printed using the --format
    string; with neither --corpus nor --verse, the help message is
    printed.
    """
    parser = argparse.ArgumentParser(
        description='Verse syllabication.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--verse',
        metavar="words",
        nargs='+',
        help='verse words to syllabify (if no corpus is provided)')
    parser.add_argument(
        '--corpus',
        help="Corpus file to syllabify. Use - for reading from stdin")
    parser.add_argument(
        '--hyphen',
        default=" -- ",
        help="String to be used when hyphenating a verse.")
    parser.add_argument(
        '--format',
        default="{hyphenated_verse}",
        help="""Python format string for outputing the verse.
Possible keywords, to be used between curly braces in the format string,
are:
*) hyphenated_verse: the verse after applying hyphenation
*) verse: the verse without hyphenation
*) metric: the verse metric (a number).""")
    args = vars(parser.parse_args(argv))

    if args['corpus']:
        # Syllabify a corpus
        reader = CorpusReader()
        corpus = reader.read(args['corpus'])
        corpus.syllabify(
            syllable_tokenizer = SyllableTokenizerWithWordSeparation())
        verses = corpus.get_verses()
    elif args['verse']:
        # read verse on command line arguments
        verse = Verse(" ".join(args['verse']))
        verse.syllabify(
            syllable_tokenizer = SyllableTokenizerWithWordSeparation())
        verses = [verse]
    else:
        parser.print_help()
        return

    # Both sources are printed the same way.
    for verse in verses:
        hyphenated_verse = verse.hyphenate(hyphen = args['hyphen'],
                                           add_space = True)
        print(args['format'].format(verse = verse.get_text(),
                                    hyphenated_verse = hyphenated_verse,
                                    metric = verse.get_metric()))
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()