Brushing up on your Python Skills

This class is taught in Python, and a neglected basic of ALP (Ancient Language Processing) is preprocessing our texts.

Preprocessing for ALP is much broader than what computer and data scientists usually mean by the term. Philological conventions in printed and digital publications carry a great deal of information that needs to be correctly parsed before any computational manipulation (analysis).

In this notebook, we provide four examples of messy texts: two in Egyptian and two in Akkadian. We will work through how to parse each text and what information we lose in doing so, brushing up on basic Python syntax and functions as we go.

Akkadian Example 1:

https://cdli.mpiwg-berlin.mpg.de/artifacts/225104

&P225104 = TIM 10, 134
#atf: use lexical
#Nippur 2N-T496; proverb; Alster proverbs
@tablet
@obverse
@column 1
1. dub-sar hu-ru
2. a-ga-asz-gi4-gi4-me!(|ME+ASZ|)-e-ne
3. dub-sar hu-ru
4. a-ga-asz-gi4-gi4-me!(|ME+ASZ|)-e#-ne
@reverse
@column 1
1. igi-bi 3(disz) 3(asz) 6(disz)

Task 1:

How do we turn this raw text into a list of words?

akk1 = """&P225104 = TIM 10, 134
#atf: use lexical
#Nippur 2N-T496; proverb; Alster proverbs
@tablet
@obverse
@column 1
1. dub-sar hu-ru
2. a-ga-asz-gi4-gi4-me!(|ME+ASZ|)-e-ne
3. dub-sar hu-ru
4. a-ga-asz-gi4-gi4-me!(|ME+ASZ|)-e#-ne
@reverse
@column 1
1. igi-bi 3(disz) 3(asz) 6(disz)
"""
akk1
# split string to lines of texts
lines = akk1.split("\n")
lines
# remove blanks

lines_full = []
for line in lines:
  if line != "":
    lines_full.append(line)

lines_full
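The same filtering can be done in one line with a list comprehension, a construction that reappears later in this notebook:

lines_full = [line for line in lines if line != ""]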
# keep only lines that begin with a number
# use regular expressions

import re

text_lines = []
for line in lines_full:
  if re.match(r"^\d", line) is not None:
    text_lines.append(line)

text_lines
# separate lines into words

words_appended = []
words_extended = []
for line in text_lines:
  temp_words = line.split()
  words_appended.append(temp_words[1:]) # creates list of lists
  words_extended.extend(temp_words[1:]) # creates list

print(words_appended)
print("-------------------------------")
print(words_extended)
# rewrite the code above as a function
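One possible solution, as a sketch (the function name atf_to_words and the flat parameter are our own choices):

import re

def atf_to_words(raw_text, flat=True):
    """Turn a raw ATF transliteration into a list of words (flat=True)
    or a list of word lists per line (flat=False)."""
    lines = [line for line in raw_text.split("\n") if line != ""]
    text_lines = [line for line in lines if re.match(r"^\d", line) is not None]
    if flat:
        words = []
        for line in text_lines:
            words.extend(line.split()[1:]) # [1:] drops the line number
        return words
    return [line.split()[1:] for line in text_lines]

atf_to_words(akk1)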

What information did we lose when preprocessing the texts in this way?

Task 2:

Create a dictionary from the raw texts, of the following format:

{"pnum": ...
 "textID": ...
 "surface": [{
  "surfaceType": ...
  "columns": [{
    "columnNum": ...
    "text": [{
      "lineNum": ...
      "words": [..., ..., ...]
    }]
  }]
 }]}
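For this tablet, the filled-in structure should come out roughly as follows (abbreviated; the values are taken from the raw text above):

{"pnum": "P225104",
 "textID": "TIM 10, 134",
 "surface": [{
   "surfaceType": "obverse",
   "columns": [{
     "columnNum": 1,
     "text": [{"lineNum": "1", "words": ["dub-sar", "hu-ru"]}, ...]
   }]
 }, ...]}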
# separate text into lines

lines = akk1.split("\n")

lines_full = []
for line in lines:
  if line != "":
    lines_full.append(line)

lines_full
# store the pnum and textID in variables

text_ids = lines_full[0]
pnum, textID = text_ids.split("=")

pnum = pnum.strip()[1:] # [1:] drops the leading "&"
textID = textID.strip()
print(pnum)
print(textID)
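A small robustness suggestion (our own): pass maxsplit=1 to split, so that a stray "=" later in the title line cannot break the tuple unpacking:

pnum, textID = text_ids.split("=", 1)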
# create a dictionary for each surface (simple no regex method)
# what do you do when you have different types of inscribed object? (e.g. cylinder, prism, bowl, slab, etc.)

valid_surface_values = ["@obverse", "@reverse"]

surface_idx = []

for index, line in enumerate(lines_full):
  if line in valid_surface_values: # what is dangerous here? if a surface line is not exactly(!) one of these values, it will be missed
    surface_idx.append(index)
print(surface_idx)
# create a dictionary for each surface (complicated with regex method)
# what do you do when you have different type of inscribed object? (e.g. cylinder, prism, bowl, slab, etc.)

valid_surface_values = ["@obverse", "@reverse"]

pattern = r"^(?:" + "|".join([re.escape(value) for value in valid_surface_values]) + ")" # This is called a list comprehension

surface_idx = []

for index, line in enumerate(lines_full): # returns the index for the line and the content of the line
  if re.match(pattern, line) is not None:
    surface_idx.append(index)
print(surface_idx)
# the same code as in the cell above, but without the list comprehension
# create a dictionary for each surface (complicated with regex method)
# what do you do when you have different type of inscribed object? (e.g. cylinder, prism, bowl, slab, etc.)

valid_surface_values = ["@obverse", "@reverse"]

#pattern = r"^(?:" + "|".join([re.escape(value) for value in valid_surface_values]) + ")" # This is called a list comprehension

escaped_values = []
for value in valid_surface_values:
    escaped_values.append(re.escape(value))
print(escaped_values)

pattern = r"^(?:" + "|".join(escaped_values) + ")"
    
surface_idx = []

for index, line in enumerate(lines_full): # returns the index for the line and the content of the line
  if re.match(pattern, line) is not None:
    surface_idx.append(index)
print(surface_idx)
# use surface indices to create surface dictionaries
# surfaceType; columnNum; lineNum; words
# surfaceType is extracted from the lines at the surface indices
# columnNum: first check whether a column actually exists, then extract the number after "@column" with regex(?) or by tokenizing on spaces
# lineNum: regex for any line that begins with a number, plus any tags attached; how would it be best to store line numbers, as integers or as strings?
# words: extracted from each text line after lineNum and tokenized on spaces

for index, idx in enumerate(surface_idx): # "idx" avoids shadowing the built-in id()
    surfaceType = lines_full[idx].replace('@', '')
    print(index, idx)
    if index < len(surface_idx) - 1:
        end_of_surface = surface_idx[index+1]
    else:
        end_of_surface = len(lines_full)

    # Extract the text content for the current surface designation
    surface_content = lines_full[idx+1:end_of_surface]

    # Print the surface type and its content
    print(f"Surface Type: {surfaceType}")
    # print("Content:")
    # print('\n'.join(surface_content))
    print('---')

    # Extract column number, line numbers, and words for each surface content
    for line in surface_content:
        columnNum = None
        lineNum = None
        words = []

        # Check if the line contains a column number
        if '@column' in line:
            parts = line.split()
            if len(parts) >= 2:
                try:
                    columnNum = int(parts[1])
                except ValueError:
                    pass
            print(f"Column Number: {columnNum}")
            print('---')
            continue  # Skip processing the line with @column

        # Check if the line contains a line number
        if '.' in line:
            parts = line.split('.', 1) # maxsplit=1, in case a period occurs later in the line
            if len(parts) >= 2:
                lineNum = parts[0].strip()

        # Tokenize the words in the line
        if lineNum:
            words = parts[1].strip().split()
        else:
            words = line.strip().split()

        # Print the extracted information for each line
        print(f"Line Number: {lineNum}")
        print(f"Words: {words}")
        print('---')
# Combine the surfaces and metadata into one dictionary

output = {
    "pnum": pnum,
    "textID": textID,
    "surface": []
}

for index, idx in enumerate(surface_idx): # "idx" avoids shadowing the built-in id()
    surfaceType = lines_full[idx].replace('@', '')
    surface = {
        "surfaceType": surfaceType,
        "columns": []
    }

    if index < len(surface_idx) - 1:
        end_of_surface = surface_idx[index+1]
    else:
        end_of_surface = len(lines_full)

    # Extract the text content for the current surface designation
    surface_content = lines_full[idx+1:end_of_surface]

    # Extract column number, line numbers, and words for each surface content
    # NB: this assumes one column per surface; a second @column line would
    # overwrite columnNum instead of opening a new column dictionary
    columnNum = None
    column = {
        "columnNum": None,
        "text": []
    }
    for line in surface_content:
        lineNum = None
        words = []

        # Check if the line contains a column number
        if '@column' in line:
            parts = line.split()
            if len(parts) >= 2:
                try:
                    columnNum = int(parts[1])
                    column["columnNum"] = columnNum
                except ValueError:
                    pass
            continue  # Skip processing the line with @column

        # Check if the line contains a line number
        if '.' in line:
            parts = line.split('.', 1) # maxsplit=1, in case a period occurs later in the line
            if len(parts) >= 2:
                lineNum = parts[0].strip()

        # Tokenize the words in the line
        if lineNum:
            words = parts[1].strip().split()
        else:
            words = line.strip().split()

        # Add the line information to the column
        line_info = {
            "lineNum": lineNum,
            "words": words
        }
        column["text"].append(line_info)

    # Add the column to the surface
    surface["columns"].append(column)

    # Add the surface to the output
    output["surface"].append(surface)

# Print the output in the specified dictionary format
print(output)
# Save the output dictionary as a JSON file

import json
with open(f"{pnum}.json", "w") as json_file:
    json.dump(output, json_file, indent=4)
# rewrite the code above into a function

print(f"{pnum}.json saved") # NB: after the with-block, json_file is closed, so printing the handle itself is not informative
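One possible packaging of the Task 2 pipeline into functions, as a sketch under the same assumptions as above (exactly one column per surface, surfaces named @obverse/@reverse; the function names are our own choices):

import json
import re

def parse_atf(raw_text):
    """Parse a simple ATF transliteration into the nested dictionary of Task 2."""
    lines_full = [line for line in raw_text.split("\n") if line != ""]

    pnum, textID = lines_full[0].split("=", 1)
    output = {"pnum": pnum.strip()[1:], "textID": textID.strip(), "surface": []}

    surface_idx = [i for i, line in enumerate(lines_full)
                   if line in ("@obverse", "@reverse")]

    for index, start in enumerate(surface_idx):
        end = surface_idx[index + 1] if index < len(surface_idx) - 1 else len(lines_full)
        column = {"columnNum": None, "text": []} # assumes one column per surface
        for line in lines_full[start + 1:end]:
            if line.startswith("@column"):
                column["columnNum"] = int(line.split()[1])
                continue
            if re.match(r"^\d", line) is None: # skip anything that is not a text line
                continue
            lineNum, _, rest = line.partition(".")
            column["text"].append({"lineNum": lineNum.strip(),
                                   "words": rest.strip().split()})
        output["surface"].append({"surfaceType": lines_full[start].replace("@", ""),
                                  "columns": [column]})
    return output

def save_as_json(output):
    with open(f"{output['pnum']}.json", "w") as json_file:
        json.dump(output, json_file, indent=4)

save_as_json(parse_atf(akk1))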

Egyptian Example 1:

A sentence from the sarcophagus of the Napatan king Aspelta (c. 600-580 BCE), found in his pyramid in Nuri, Sudan (Nu. 8), https://collections.mfa.org/objects/145117

Get the context of the sentence from the Thesaurus Linguae Aegyptiae: https://thesaurus-linguae-aegyptiae.de/text/27KHHMEP4VHSDH737F2OFLKNSE/sentences

# This dictionary was created from the original JSON file

eg1 = {'publication_statement': {'credit_citation': 'Doris Topmann, Sentence ID 2CBOF5UQ7JGETCXG2CQKPCWDZM <https://github.com/thesaurus-linguae-aegyptiae/tla-raw-data/blob/v17/sentences/2CBOF5UQ7JGETCXG2CQKPCWDZM.json>, in: Thesaurus Linguae Aegyptiae: Raw Data <https://github.com/thesaurus-linguae-aegyptiae/tla-raw-data>, Corpus issue 17 (31 October 2022), ed. by Tonio Sebastian Richter & Daniel A. Werning on behalf of the Berlin-Brandenburgische Akademie der Wissenschaften and Hans-Werner Fischer-Elfert & Peter Dils on behalf of the Sächsische Akademie der Wissenschaften zu Leipzig (first published: 22 September 2023)', 'collection_editors': 'Tonio Sebastian Richter & Daniel A. Werning on behalf of the Berlin-Brandenburgische Akademie der Wissenschaften and Hans-Werner Fischer-Elfert & Peter Dils on behalf of the Sächsische Akademie der Wissenschaften zu Leipzig', 'data_engineers': {'input_software_BTS': ['Christoph Plutte', 'Jakob Höper'], 'database_curation': ['Simon D. Schweitzer'], 'data_transformation': ['Jakob Höper', 'R. Dominik Blöse', 'Daniel A. Werning']}, 'date_published_in_TLA': '2022-10-31', 'rawdata_first_published': '2023-09-22', 'corresponding_TLA_URL': 'https://thesaurus-linguae-aegyptiae.de/sentence/2CBOF5UQ7JGETCXG2CQKPCWDZM', 'license': 'Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) <https://creativecommons.org/licenses/by-sa/4.0/>'}, 'context': {'line': 'III', 'paragraph': None, 'pos': 7, 'textId': '27KHHMEP4VHSDH737F2OFLKNSE', 'textType': 'Text', 'variants': 1}, 'eclass': 'BTSSentence', 'glyphs': {'mdc_compact': None, 'unicode': None}, 'id': '2CBOF5UQ7JGETCXG2CQKPCWDZM', 'relations': {'contains': [{'eclass': 'BTSAnnotation', 'id': 'DYJEAXFKBJAXJPVLJGWREJZJ5M', 'ranges': [{'end': 'OKLGJLCEQFHU7HDRYUTYR352YA', 'start': '22TFIMS2CBBCFFCDSCAIT3HR3Y'}], 'type': 'ägyptologische Textsegmentierung'}], 'partOf': [{'eclass': 'BTSText', 'id': '27KHHMEP4VHSDH737F2OFLKNSE', 'name': 'Isis (HT 15, HT 14, HT 17)', 'type': 'Text'}]}, 'tokens': [{'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': '(unspecified)', 'lingGloss': 'PTCL', 'numeric': 3}, 'glyphs': {'mdc_artificially_aligned': True, 'mdc_compact': 'D35:N35', 'mdc_original': 'D35-N35', 'mdc_original_safe': None, 'mdc_tla': 'D35-N35', 'order': [1, 2], 'unicode': '𓂜𓈖'}, 'id': '22TFIMS2CBBCFFCDSCAIT3HR3Y', 'label': 'nn', 'lemma': {'POS': {'type': 'particle'}, 'id': '851961'}, 'transcription': {'mdc': 'nn', 'unicode': 'nn'}, 'translations': {'de': ['[Negationspartikel]']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': 'SC.act.ngem.nom.subj_Neg.nn', 'lingGloss': 'V\\tam.act', 'numeric': 210020}, 'glyphs': {'mdc_artificially_aligned': False, 'mdc_compact': 'W11-V28-A7', 'mdc_original': 'W11-V28-A7', 'mdc_original_safe': None, 'mdc_tla': 'W11-V28-A7', 'order': [2, 3, 4], 'unicode': '𓎼𓎛𓀉'}, 'id': 'IOLUGQXLCRGNLMTAPJ65LI7MHU', 'label': 'gḥ', 'lemma': {'POS': {'subtype': 'verb_3-lit', 'type': 'verb'}, 'id': '166480'}, 'transcription': {'mdc': 'gH', 'unicode': 'gḥ'}, 'translations': {'de': ['matt sein']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': 'Noun.pl.stpr.3sgm', 'lingGloss': 'N.f:pl:stpr', 'numeric': 70154}, 'glyphs': {'mdc_artificially_aligned': True, 'mdc_compact': 'D36:X1*F51B-Z2', 'mdc_original': 'D36-X1-F51B-Z2', 'mdc_original_safe': None, 'mdc_tla': 'D36-X1-F51B-Z2', 'order': [5, 6, 7, 8], 'unicode': '𓂝𓏏𓄹︀\U00013440𓏥'}, 'id': 'GUVBJUGCSVF5VN55PN6RYS4YLI', 'label': 'ꜥ,t.pl', 
'lemma': {'POS': {'subtype': 'substantive_fem', 'type': 'substantive'}, 'id': '34550'}, 'transcription': {'mdc': 'a.t.PL', 'unicode': 'ꜥ.t.PL'}, 'translations': {'de': ['Glied; Körperteil']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': '(unspecified)', 'lingGloss': '-3sg.m', 'numeric': 3}, 'glyphs': {'mdc_artificially_aligned': False, 'mdc_compact': 'I9', 'mdc_original': 'I9', 'mdc_original_safe': None, 'mdc_tla': 'I9', 'order': [9], 'unicode': '𓆑'}, 'id': 'GIHCJ27JXVAM7GDUYWGEPKBRB4', 'label': '=f', 'lemma': {'POS': {'subtype': 'personal_pronoun', 'type': 'pronoun'}, 'id': '10050'}, 'transcription': {'mdc': '=f', 'unicode': '=f'}, 'translations': {'de': ['[Suffix Pron. sg.3.m.]']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': '(unspecified)', 'lingGloss': 'dem.f.pl', 'numeric': 3}, 'glyphs': {'mdc_artificially_aligned': True, 'mdc_compact': 'M17-Q3:N35', 'mdc_original': 'M17-Q3-N35', 'mdc_original_safe': None, 'mdc_tla': 'M17-Q3-N35', 'order': [10, 11, 12], 'unicode': '𓇋𓊪𓈖'}, 'id': 'Z6HTGGPBPRDT3OZTZNXRF2GRDA', 'label': 'jp〈t〉n', 'lemma': {'POS': {'subtype': 'demonstrative_pronoun', 'type': 'pronoun'}, 'id': '850009'}, 'transcription': {'mdc': 'jp〈t〉n', 'unicode': 'jp〈t〉n'}, 'translations': {'de': ['diese [Dem.Pron. pl.f.]']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': '(unspecified)', 'lingGloss': 'TITL', 'numeric': 3}, 'glyphs': {'mdc_artificially_aligned': False, 'mdc_compact': 'D4-Q1-A40', 'mdc_original': 'D4-Q1-A40', 'mdc_original_safe': None, 'mdc_tla': 'D4-Q1-A40', 'order': [13, 14, 15], 'unicode': '𓁹𓊨𓀭'}, 'id': 'UCFJWBLRKJG4NJWTWT22WDR2MU', 'label': 'Wsr,w', 'lemma': {'POS': {'subtype': 'title', 'type': 'epitheton_title'}, 'id': '49461'}, 'transcription': {'mdc': 'wsr.w', 'unicode': 'Wsr.w'}, 'translations': {'de': ['Osiris (Totentitel des Verstorbenen)']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': '(unspecified)', 'lingGloss': 'N', 'numeric': 3}, 'glyphs': {'mdc_artificially_aligned': True, 'mdc_compact': 'M23-X1:N35', 'mdc_original': 'M23-X1-N35', 'mdc_original_safe': None, 'mdc_tla': 'M23-X1-N35', 'order': [16, 17, 18], 'unicode': '𓇓𓏏𓈖'}, 'id': 'LI5FJI4ZUJEMPIKS5RQ5HHNBUE', 'label': 'nzw', 'lemma': {'POS': {'type': 'substantive'}, 'id': '88040'}, 'transcription': {'mdc': 'nzw', 'unicode': 'nzw'}, 'translations': {'de': ['König']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': '(unspecified)', 'lingGloss': 'ROYLN', 'numeric': 3}, 'glyphs': {'mdc_artificially_aligned': True, 'mdc_compact': 'V30:N17-N17', 'mdc_original': 'V30-N17-N17', 'mdc_original_safe': None, 'mdc_tla': 'V30-N17-N17', 'order': [19, 20, 21], 'unicode': '𓎟𓇿𓇿'}, 'id': 'ICADWHGbHkfdokpooG4eCy3Zfe8', 'label': 'nb-Tꜣ,du', 'lemma': {'POS': {'subtype': 'epith_king', 'type': 'epitheton_title'}, 'id': '400038'}, 'transcription': {'mdc': 'nb-tA.DU', 'unicode': 'nb-Tꜣ.DU'}, 'translations': {'de': ['Herr der Beiden Länder (Könige)']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': '(unspecified)', 'lingGloss': 'TITL', 'numeric': 3}, 'glyphs': {'mdc_artificially_aligned': True, 'mdc_compact': 'V30:D4-Aa1*X1:Y1', 'mdc_original': 'V30-D4-Aa1-X1-Y1', 'mdc_original_safe': None, 'mdc_tla': 'V30-D4-Aa1-X1-Y1', 'order': [22, 23, 24, 25, 26], 'unicode': '𓎟𓁹𓐍𓏏𓏛'}, 'id': 'ICADWHT2O1dc30SXuRZUlquIDpM', 'label': 'nb-jr(,t)-(j)ḫ,t', 'lemma': {'POS': 
{'subtype': 'title', 'type': 'epitheton_title'}, 'id': '400354'}, 'transcription': {'mdc': 'nb-jr(.t)-(j)x.t', 'unicode': 'nb-jr(.t)-(j)ḫ.t'}, 'translations': {'de': ['Herr des Rituals']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': '(unspecified)', 'lingGloss': 'ROYLN', 'numeric': 3}, 'glyphs': {'mdc_artificially_aligned': True, 'mdc_compact': '<-M17-O34:Q3-E23-N17->', 'mdc_original': '<-M17-O34-Q3-E23-N17->', 'mdc_original_safe': None, 'mdc_tla': '<-M17-O34-Q3-E23-N17->', 'order': [18, 19, 20, 21, 22, 23], 'unicode': '𓍹\U0001343c𓇋𓊃𓊪𓃭𓇿\U0001343d𓍺'}, 'id': 'J3MLYALWVNAMDDG33VZ3RIEEUA', 'label': 'Jsplt', 'lemma': {'POS': {'subtype': 'kings_name', 'type': 'entity_name'}, 'id': '850103'}, 'transcription': {'mdc': 'jsplt', 'unicode': 'Jsplt'}, 'translations': {'de': ['Aspelta']}, 'type': 'word'}, {'annoTypes': ['ägyptologische Textsegmentierung'], 'flexion': {'btsGloss': '(unspecified)', 'lingGloss': 'N.m:sg', 'numeric': 3}, 'glyphs': {'mdc_artificially_aligned': True, 'mdc_compact': 'U5:D36-P8h', 'mdc_original': 'U5-D36-P8h', 'mdc_original_safe': None, 'mdc_tla': 'U5-D36-P8h', 'order': [25, 26, 27], 'unicode': '𓌷𓂝𓊤︂'}, 'id': 'OKLGJLCEQFHU7HDRYUTYR352YA', 'label': 'mꜣꜥ-ḫrw', 'lemma': {'POS': {'subtype': 'substantive_masc', 'type': 'substantive'}, 'id': '66750'}, 'transcription': {'mdc': 'mAa-xrw', 'unicode': 'mꜣꜥ-ḫrw'}, 'translations': {'de': ['Gerechtfertigter (der selige Tote)']}, 'type': 'word'}], 'transcription': {'mdc': 'nn gH a.t.PL=f jp〈t〉n wsr.w nzw nb-tA.DU nb-jr(.t)-(j)x.t jsplt mAa-xrw', 'unicode': 'nn gḥ ꜥ.t.PL=f jp〈t〉n Wsr.w nzw nb-Tꜣ.DU nb-jr(.t)-(j)ḫ.t Jsplt mꜣꜥ-ḫrw'}, 'translations': {'de': ['Diese seine Glieder werden nicht matt sein, (die des) Osiris Königs, des Herrn der Beiden Länder, des Herrn des Rituals, Aspelta, des Gerechtfertigten.']}, 'type': None, 'wordCount': 11, 'editors': {'author': 'Doris Topmann', 'contributors': None, 'created': '2020-12-23 12:24:26', 'type': None, 'updated': '2022-08-29 10:22:01'}}
print(eg1)
# parse the dictionary (json)

unicodeHiero = []
transcription = []
translLemma = []
posLemma = []
tokenID = []

for text_word in eg1["tokens"]:
    print(text_word["glyphs"]["unicode"], text_word["transcription"]["unicode"], text_word["translations"]["de"][0], text_word["lemma"]["POS"]["type"], text_word["id"])
    tokenID.append(text_word["id"])
    unicodeHiero.append(text_word["glyphs"]["unicode"])
    translLemma.append(text_word["translations"]["de"][0])
    posLemma.append(text_word["lemma"]["POS"]["type"])

    if text_word["transcription"]["unicode"].startswith("="): # replace the equals sign, as it causes trouble in spreadsheet software like MS Excel
        transcription.append(text_word["transcription"]["unicode"].replace("=", '⸗')) # U+2E17
    else:
        transcription.append(text_word["transcription"]["unicode"])
    
# get the ID of this sentence

sentenceID = eg1["id"]
# create a dataframe and fill it

import pandas as pd

df_eg = pd.DataFrame({
    'unicode_hieroglyphs': unicodeHiero,
    'unicode_transcription': transcription,
    'lemma_translation': translLemma,
    'part-of-speech': posLemma,
    'tokenID': tokenID
})

df_eg
# save as *.csv

fileName = "aspelta_TLA_Sentence_" + sentenceID + ".csv"
df_eg.to_csv(fileName)
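Note that to_csv also writes the DataFrame index as an unnamed first column by default; since the row index carries no information here, you may prefer:

df_eg.to_csv(fileName, index=False)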

Akkadian Example 2:

Consider the following Akkadian text:

http://www.achemenet.com//fr/item/?/sources-textuelles/textes-par-publication/Strassmaier_Cyrus/1665118

6 udu-nita2 ina šuII Iden-gi a-šú šá Id[
a-na 8 gín 4-tú kù-babbar i-na kù-babbar
šá i-di é [ o o o ] a-na é-babbar-ra
it-ta-din 5 udu-nita2 šá Ika-ṣir
a-šú šá Iden-mu a-na 7 gín 4-tú
kù-babbar šá muh-hi dul-lu Imu-mu
ú-šá-hi-su a-na lìb-bi sì-na
1 udu-nita2 a-na 1 gín 4-tú kù-babbar
ina šuII Idutu-ba-šá! [
1 udu-nita2 šá IDU-[
a-na 1? gín [
pap [13 udu-nita2-meš
iti du6 u4 [o-kam] mu sag nam-lugal-la
Iku-ra-áš lugal tin-tirki u kur-kur

How would you preprocess this text?

## raw text

akk2 = """6 udu-nita<sub>2</sub> <i>ina</i> šu<sup>II</sup> <sup>Id</sup>en-gi a-<i>šú šá</i> <sup>Id</sup>[
<i>a-na</i> 8 gín 4-<i>tú </i>kù-babbar<i> i-na</i> kù-babbar 
<i>šá</i> <i>i-di</i> é [ o o o ]<i> a-na</i> é-babbar-ra 
<i>it-ta-din</i> 5 udu-nita<sub>2</sub> <i>šá</i> <sup>I</sup><i>ka-ṣir</i> 
a-<i>šú šá</i> <sup>Id</sup>en-mu<i> a-na</i> 7 gín 4-<i>tú</i> 
kù-babbar <i>šá</i> <i>muh-hi</i> <i>dul-lu</i> <sup>I</sup>mu-mu 
<i>ú-šá-hi-su a-na</i> <i>lìb-bi</i> sì-<i>na</i> 
1 udu-nita<sub>2</sub><i> a-na</i> 1 gín 4-<i>tú </i>kù-babbar 
<i>ina</i> šu<sup>II</sup> <sup>Id</sup>utu-ba-<i>šá</i><sup>!</sup> [
1 udu-nita<sub>2</sub> <i>šá</i> <sup>I</sup>DU-[
<i>a-na</i> 1<sup>?</sup> gín [
pap [13 udu-nita<sub>2</sub>-meš
iti du<sub>6</sub> u<sub>4</sub> [o-kam] mu sag nam-lugal-la 
<sup>I</sup><i>ku-ra-áš</i> lugal tin-tir<sup>ki</sup> <i>u</i> kur-kur"""

akk2
## Clean the raw text in akk2

akk2 = akk2.replace("<sub>", "")
akk2 = akk2.replace("</sub>", "")

## Harmonize word and sign boundaries

# Move blanks from inside the <i>…</i> tags to outside
akk2 = akk2.replace(" </i>", "</i> ")
akk2 = akk2.replace("<i> ", " <i>")
#print(akk2)

import re
# Add hyphen before <sup> tags if there is no space before the tag
akk2 = re.sub(r'([^ ])(<sup>)', r'\1-\2', akk2)              
# Add hyphen after </sup> and </i> tags if there is no space after the tag
akk2 = re.sub(r'(</sup>|</i>)([^ ])', r'\1-\2', akk2)

# from a-<i>šú šá</i> to a-<i>šú</i> <i>šá</i>
pattern = r"(<i>[^<]*)([ -])"
while True:
    new_text = re.sub(pattern, r"\1</i>\2<i>", akk2)
    if new_text == akk2:  # End loop if there are no more differences between the existing one and the one created by re substitution
        break
    akk2 = new_text

# Replace double hyphens by simple ones
akk2 = akk2.replace("--", "-")

print(akk2)
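To see what the fixpoint loop does, here is a minimal demo on a single phrase (illustration only; the variable names are our own):

demo = "a-<i>šú šá</i>"
demo_pattern = r"(<i>[^<]*)([ -])"
while True:
    new_demo = re.sub(demo_pattern, r"\1</i>\2<i>", demo)
    if new_demo == demo:
        break
    demo = new_demo
print(demo) # a-<i>šú</i> <i>šá</i>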
## Create a dictionary with annotations

akk2_lineList = akk2.split("\n")

lines_list = []
line_count = 1
for line in akk2_lineList:
    temp_words = line.split()
    line_dict = {}
    
    line_dict['line_id'] = line_count
    line_dict['words'] = temp_words

    lines_list.append(line_dict)
    line_count += 1

#print(lines_list)

for line in lines_list:
    subword_list = []
    word_count = 1

    for word in line['words']:
        subword_dict = {}

        sign_list = []

        if '-' in word: # if more than one sign, separated by hyphens
            temp_signs = word.split('-')
            sign_list.extend(temp_signs)
        else: # if only an individual sign
            sign_list.append(word)

        signs_per_word = list(sign_list) # a plain copy of the sign list

        subword_dict['word_id'] = word_count
        word_count += 1
        subword_dict['signs'] = signs_per_word

        list_sign_func_dict = []
        for sign in subword_dict['signs']:
            sign_func_dict = {}
            #print(sign)
            if sign.startswith('<i>') and sign.endswith('</i>'):
                sign_func_dict['sign'] = sign[3:-4]
                sign_func_dict['sign_function'] = 'phonogram'
            elif sign.startswith('<sup>') and sign.endswith('</sup>'):
                sign_func_dict['sign'] = sign[5:-6]
                sign_func_dict['sign_function'] = 'classifier'
            else:
                sign_func_dict['sign'] = sign
                sign_func_dict['sign_function'] = 'logogram'

            list_sign_func_dict.append(sign_func_dict)
        #print(list_sign_func_dict)

        subword_dict['signs'] = list_sign_func_dict
        subword_list.append(subword_dict)

    line['words'] = subword_list
    

#print(lines_list)    
for line in lines_list:
    print(line)
    #for words in line:
    #    print(line['words'])
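As in the first Akkadian example, the annotated structure can be serialized to JSON; a minimal sketch (the file name akk2_annotated.json is our own choice):

import json
with open("akk2_annotated.json", "w", encoding="utf-8") as json_file:
    json.dump(lines_list, json_file, ensure_ascii=False, indent=4)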

Egyptian Example 2:

How to deal with non-Unicode hieroglyphs (<g> tag + Gardiner number)

eg2_csv = """,text,line,word,ref,frag,norm,unicode_word,unicode,lemma_id,cf,pos,sense
92,3Z5EM77HJFCOPKZDDZFEMI6KVY,5,7,3Z5EM77HJFCOPKZDDZFEMI6KVY.5.7,gꜣu̯.w,gꜣu̯.w,<g>V96</g>𓅱,"['<', 'g', '>', 'V', '9', '6', '<', '/', 'g', '>', '𓅱']",166210,gꜣu̯,VERB,eng sein; entbehren; (jmdn.) Not leiden lassen
151,4WVXFJZFLNAYHP3Y5O5SLWD7DA,2,2,4WVXFJZFLNAYHP3Y5O5SLWD7DA.2.2,nꜥw,nꜥw,𓈖𓂝𓅱<g>I14C</g>𓏤,"['𓈖', '𓂝', '𓅱', '<', 'g', '>', 'I', '1', '4', 'C', '<', '/', 'g', '>', '𓏤']",80510,Nꜥw,PROPN,Sich windender (Personifikation der Schlange)
153,4WVXFJZFLNAYHP3Y5O5SLWD7DA,2,5,4WVXFJZFLNAYHP3Y5O5SLWD7DA.2.5,nꜥw,nꜥw,𓈖𓂝𓅱<g>I14C</g>𓏤,"['𓈖', '𓂝', '𓅱', '<', 'g', '>', 'I', '1', '4', 'C', '<', '/', 'g', '>', '𓏤']",80510,Nꜥw,PROPN,Sich windender (Personifikation der Schlange)
200,67HZI45S3REA3LWVZOKJ6QJOIE,14,9,67HZI45S3REA3LWVZOKJ6QJOIE.14.9,nbi̯.n,nbi̯.n,𓈖𓎟𓃀<g>D107</g>𓈖,"['𓈖', '𓎟', '𓃀', '<', 'g', '>', 'D', '1', '0', '7', '<', '/', 'g', '>', '𓈖']",82520,nbi̯,VERB,schmelzen; gießen
204,67HZI45S3REA3LWVZOKJ6QJOIE,14,13,67HZI45S3REA3LWVZOKJ6QJOIE.14.13,nḏr.n,nḏr.n,𓈖𓇦𓂋<g>U19A</g>𓆱𓈖,"['𓈖', '𓇦', '𓂋', '<', 'g', '>', 'U', '1', '9', 'A', '<', '/', 'g', '>', '𓆱', '𓈖']",91630,nḏr,VERB,(Holz) bearbeiten; zimmern
206,67HZI45S3REA3LWVZOKJ6QJOIE,14,15,67HZI45S3REA3LWVZOKJ6QJOIE.14.15,b(w)n.wDU,bwn.wDU,𓃀𓈖𓏌𓅱<g>T86</g><g>T86</g>,"['𓃀', '𓈖', '𓏌', '𓅱', '<', 'g', '>', 'T', '8', '6', '<', '/', 'g', '>', '<', 'g', '>', 'T', '8', '6', '<', '/', 'g', '>']",55330,bwn,NOUN,Speerspitzen (des Fischspeeres)
"""
import pandas as pd
from io import StringIO

# Convert the string into a StringIO object
# This is only necessary because we provide the csv as a string rather than as a file loaded into the notebook
csv_data = StringIO(eg2_csv)

# Read the data into a pandas DataFrame
df = pd.read_csv(csv_data)

# Display the DataFrame
df
def split_tags(text):
    parts = []  # List to collect output of the function
    while '<g>' in text and '</g>' in text:
        pre, rest = text.split('<g>', 1)  # splits at the first <g> found
        tag_content, post = rest.split('</g>', 1)  # splits the rest at the first </g> found

        # extend() over a string adds each character separately, so every
        # Unicode hieroglyph before the first <g></g> tag becomes its own list item
        parts.extend(pre)

        #  adds element inside the first <g></g> tag to the List
        parts.append(tag_content)

        # text variable is set to remaining text
        text = post

    # After the last tag, the remaining text is likewise split into single characters and added
    parts.extend(text)
    return parts

def process_text(text):
    if pd.isna(text): # deals with NaN
        return []
    else:
        return split_tags(text)

# apply functions to every row of the column 'unicode_word'
df['unicode_splitted'] = df['unicode_word'].apply(process_text)
# delete obsolete column
df.drop('unicode', axis=1, inplace=True)

df
#df.to_csv("EG-TLA-example.csv")
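To check what split_tags does on a single cell value: the Unicode hieroglyphs come apart into single characters (because list.extend over a string adds one character at a time), while Gardiner numbers inside <g>…</g> stay whole:

print(split_tags("𓈖𓂝𓅱<g>I14C</g>𓏤"))
# ['𓈖', '𓂝', '𓅱', 'I14C', '𓏤']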