Not sure whether this is of particular interest but I have this old code for importing (a particular flavor of) flextext to JSON:
flextextToJSON.js
/**
 * Converts (one particular flavor of) a parsed FLEx `.flextext` XML document
 * into a plain, JSON-serializable text object of the shape
 * `{ metadata: {}, sentences: [{ orthographic, transcription, translation, words }] }`.
 *
 * The parser only relies on the DOM methods `querySelector`,
 * `querySelectorAll`, `matches`, `getAttribute` and the `children` /
 * `textContent` properties of the nodes it is given.
 */
class FlexTextParser {
  /**
   * @param {Object} options
   * @param {Object} [options.doc] - Parsed flextext XML document (or root element).
   * @param {Object} [options.dom] - Legacy alias for `doc`; used when `doc` is not given.
   * @param {string} [options.glossLang] - When a morph carries several
   *   `item[type="gls"]` children, prefer the one whose `lang` attribute equals
   *   this value. Falls back to the first gloss item when none matches or when
   *   the option is omitted.
   */
  constructor({ doc, dom, glossLang }) {
    // The original constructor stored `dom` while every method read `this.doc`,
    // so the parser could never run. Accept both keys and expose both properties.
    this.doc = doc !== undefined ? doc : dom
    this.dom = this.doc
    this.glossLang = glossLang
  }

  /**
   * The whole text as a plain object.
   * @returns {{metadata: Object, sentences: Object[]}}
   */
  get text() {
    return {
      metadata: {},
      sentences: this.sentences,
    }
  }

  /**
   * One sentence object per `<phrase>` element found in the document.
   * @returns {Object[]}
   */
  get sentences() {
    return Array.from(this.doc.querySelectorAll('phrase'))
      .map(phraseNode => this.phraseNodeToSentence(phraseNode))
  }

  /**
   * Builds a sentence from a `<phrase>` element.
   *
   * Only the *direct* children of the phrase are searched for the sentence-level
   * items, because nested word and morph elements contain `<item>`s of their own.
   *
   * @param {Object} phraseNode - A `<phrase>` element.
   * @returns {{orthographic: string, transcription: string, translation: string, words: Object[]}}
   */
  phraseNodeToSentence(phraseNode) {
    const children = Array.from(phraseNode.children)
    // A phrase lacking the item yields "" instead of throwing on `undefined.textContent`.
    const textOfChild = selector => {
      const item = children.find(el => el.matches(selector))
      return item ? item.textContent : ''
    }

    const orthographic = textOfChild('item[type="gls"], item[type="punct"]')
    const translation = textOfChild('item[type="lit"]')
    // Words whose form or gloss came out empty (no valid morphs) are dropped.
    const words = Array.from(phraseNode.querySelectorAll('word'))
      .map(wordNode => this.wordNodeToWord(wordNode))
      .filter(word => word.form && word.gloss)

    return {
      orthographic,
      // NOTE(review): the transcription has always mirrored the orthographic
      // line; confirm whether it should be the space-joined word forms instead.
      transcription: orthographic,
      translation,
      words,
    }
  }

  /**
   * A morph is usable when it has a gloss item, a text item and a `type` attribute.
   * @param {Object} morphNode - A `<morph>` element.
   * @returns {boolean}
   */
  isValidMorphNode(morphNode) {
    return Boolean(
      morphNode.querySelector('item[type="gls"]')
      && morphNode.querySelector('item[type="txt"]')
      && morphNode.getAttribute('type')
    )
  }

  /**
   * Text of the first descendant of `node` matching `selector`, or "" when absent.
   * @param {Object} node
   * @param {string} selector
   * @returns {string}
   */
  firstItemText(node, selector) {
    const item = node.querySelector(selector)
    return item ? item.textContent || '' : ''
  }

  /**
   * Gloss text of a morph: the gloss item whose `lang` equals `this.glossLang`
   * when that option is set and such an item exists, otherwise the first gloss
   * item; "" when the morph has no gloss item at all.
   * @param {Object} morphNode - A `<morph>` element.
   * @returns {string}
   */
  morphGloss(morphNode) {
    const glossItems = Array.from(morphNode.querySelectorAll('item[type="gls"]'))
    const preferred = this.glossLang
      ? glossItems.find(item => item.getAttribute('lang') === this.glossLang)
      : undefined
    const chosen = preferred || glossItems[0]
    return chosen ? chosen.textContent || '' : ''
  }

  /**
   * Adds the affix hyphen to a gloss: leading for suffixes, trailing for prefixes.
   * @param {string|null} type - Value of the morph's `type` attribute.
   * @param {string} gloss
   * @returns {string}
   */
  affixGloss(type, gloss) {
    if (type === 'suffix') return `-${gloss}`
    if (type === 'prefix') return `${gloss}-`
    return gloss
  }

  /**
   * Flat list of `{form, gloss}` for every `<morph>` in the document, without
   * validity filtering. Affix hyphens are added even when the gloss is empty.
   * @returns {{form: string, gloss: string}[]}
   */
  parseMorphemesNode() {
    return Array.from(this.doc.querySelectorAll('morph'))
      .map(morph => {
        const type = morph.getAttribute('type') // suffix, stem (prefix)
        return {
          form: this.firstItemText(morph, 'item[type="txt"]'),
          gloss: this.affixGloss(type, this.morphGloss(morph)),
        }
      })
  }

  /**
   * Converts one `<morph>` element to `{form, gloss}`. The affix hyphen is only
   * added when the gloss is non-empty.
   * @param {Object} morphNode - A `<morph>` element.
   * @returns {{form: string, gloss: string}}
   */
  morphNodeToMorpheme(morphNode) {
    const type = morphNode.getAttribute('type') || null // suffix, stem (prefix)
    const form = this.firstItemText(morphNode, 'item[type="txt"]')
    const rawGloss = this.morphGloss(morphNode)
    return {
      form,
      gloss: rawGloss ? this.affixGloss(type, rawGloss) : rawGloss,
    }
  }

  /**
   * Builds a word by concatenating the forms and the glosses of all valid
   * morphs found under the word's `<morphemes>` element.
   * @param {Object} wordNode - A `<word>` element.
   * @returns {{form: string, gloss: string}}
   */
  wordNodeToWord(wordNode) {
    const morphemes = Array.from(wordNode.querySelectorAll('morphemes morph'))
      .filter(morphNode => this.isValidMorphNode(morphNode))
      .map(morphNode => this.morphNodeToMorpheme(morphNode))
    return {
      form: morphemes.map(morpheme => morpheme.form).join(''),
      gloss: morphemes.map(morpheme => morpheme.gloss).join(''),
    }
  }

  /**
   * Lets `JSON.stringify(parser)` serialize the parsed text.
   * @returns {{metadata: Object, sentences: Object[]}}
   */
  toJSON() {
    return this.text
  }
}
// Here’s how you would use it on a flextext:
new FlexTextParser({doc:
This is by no means great code… not sure if using a class
makes sense, for instance. But whatevs, it solved my problem at the time.
But of course it doesn’t work unless the fields are just so. The problem is, as you mention, the fact that the labels for things don’t seem to be consistent. For example, I just randomly found some flextext online and tried my parser on that… it failed of course.
https://fiona.uni-hamburg.de/1860492f/silp1981stonyoldwomanflk.flextext
The morph
type, for instance, is quite different from what I had been working with:
<morph type="suffix" guid="d7f713dd-e8cf-11d3-9764-00c04f186933">
<item type="txt" lang="qaa-x-aaa">-ɨ</item>
<item type="cf" lang="qaa-x-aaa">-ɨ</item>
<item type="gls" lang="ru">EP</item>
<item type="gls" lang="en">EP</item>
<item type="msa" lang="en">infl:ins</item>
</morph>
There are two nodes that match item[type="gls"]
… so I guess they are to be distinguished by their lang
attribute…
I dunno. It’s complicated. This is kind of what drives me bonkers about XML to be honest. It’s not that data in XML isn’t well structured, it is. It’s just that it isn’t very self-describing as far as programming languages are concerned. I recognize that we need to be able to parse XML moving forward so we can build on existing documentation, but when I have the choice, personally, I much rather deal with JSON in a loosey-goosey way. Once there is a data structure available (objects and arrays, basically), it’s easy enough to poke around and see what is actually there.
I may be mistaken but it seems quite difficult to write a general FlexTextToJSON
function.