//
//                     DFSee, Disk and Filesystem utility
//
//   Original code Copyright (c) 1994-2025 Fsys Software and Jan van Wijk
//
// ==========================================================================
//
//   DFSee, released under MIT License
//
//   Copyright (c) 1994-2025  Fsys Software and Jan Van Wijk
//
//   Permission is hereby granted, free of charge, to any person obtaining a copy
//   of this software and associated documentation files (the "Software"), to deal
//   in the Software without restriction, including without limitation the rights
//   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
//   copies of the Software, and to permit persons to whom the Software is
//   furnished to do so, subject to the following conditions:
//
//   The above copyright notice and this permission notice shall be included in all
//   copies or substantial portions of the Software.
//
//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
//   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
//   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
//   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
//   SOFTWARE.
//
//
//   Questions on DFSee licensing can be directed to: jvw@dfsee.com
//
// ==========================================================================
//
// DFSee UTF-8 (and related Unicode) string handling
//
// Author: J. van Wijk
//
// JvW  20-03-2019 Initial version, loosely derived from apfs-fuse
//

#include <txlib.h>                              // TxLib interface

#include <dfsutf8.h>                            // DFSee UTF-8 external interface
#include <dfsutf8t.h>                           // DFSee UTF-8 Unicode tables


// Add 32-bit UNI char to normalized array for all values, returning #added
static int dfsUtfNormalizeU32C                  // RET   # of 32bit chars added
(                                               //       or -1 on invalid char
   BOOL                caseInsensitive,         // IN    Case-insensitive
   ULONG               c32,                     // IN    32-bit UNI character
   ULONG               utf32[],                 // INOUT UTF-32 normalized array
   BYTE                refs8[]                  // INOUT UTF-8  reference  array
);

// Add 32-bit UNI char to normalized array for AC00 index, returning #added
static int dfsUtfNormalizeAC00                  // RET   # of 32bit chars added
(
   ULONG               c32,                     // IN    32-bit UNI character
   ULONG               utf32[],                 // INOUT UTF-32 normalized array
   BYTE                refs8[]                  // INOUT UTF-8  reference  array
);

// Bubble-Sort array of UTF32 characters in sortorder defined by UTF8 values
static void dfsUtfSortCanonical
(
   ULONG               items,                   // IN    nr of items to sort
   ULONG               utf32[],                 // INOUT UTF-32 normalized array
   BYTE                refs8[]                  // INOUT UTF-8  reference  array
);


/*****************************************************************************/
// Compare two UTF8 strings in a strcmp compatible way
//
// Note: this is currently a plain C-library compare (strcmp or strcasecmp)
//       on the raw bytes, no Unicode normalization or case-folding is done.
//       The library result is reduced to its sign, so the return value is
//       exactly -1, 0 or 1 as documented on the function.
/*****************************************************************************/
int dfsUtf8Cmp                                  // RET   0 = EQ  1 = 1st larger
(                                               //              -1 = 1st smaller
   BYTE               *str1,                    // IN    UTF-8  coded string
   BYTE               *str2,                    // IN    UTF-8  coded string
   BOOL                caseInsensitive          // IN    Case-insensitive
)
{
   int                 rc;                      // function return
   int                 diff;                    // raw C-library compare result

   ENTER();

   //- to be implemented properly (from fuse ?)

   if (caseInsensitive)
   {
      diff = strcasecmp( (char *) str1, (char *) str2);
   }
   else
   {
      diff = strcmp(     (char *) str1, (char *) str2);
   }
   rc = (diff > 0) - (diff < 0);                // keep the sign only: -1, 0 or 1

   RETURN (rc);
}                                               // end 'dfsUtf8Cmp'
/*---------------------------------------------------------------------------*/


/*****************************************************************************/
// Convert UTF-8 string to UTF-32 string (standard Unicode codepoints)
//
// Conversion stops at the first of:
//  - the terminating ZERO of the UTF-8 string
//  - an invalid lead byte, or a missing/invalid continuation byte
//    (the incomplete character is NOT added to the output)
//  - size32 characters written (size32 is taken as the number of ULONG
//    elements available in utf32[])
//
// Characters converted before the stop are kept and counted in the result.
// Overlong encodings and surrogate values are not rejected.
/*****************************************************************************/
ULONG dfsUtf8ToUtf32                            // RET   #UTF-32 chars or 0
(
   BYTE               *utf8,                    // IN    UTF-8  coded string
   ULONG               size32,                  // IN    Size of UTF-32 array
   ULONG               utf32[]                  // OUT   UTF-32 coded string
)
{
   ULONG               rc  = 0;                 // function return # UTF-32
   int                 mbC;                     // multi-byte count UTF-8
   BYTE               *s8  = utf8;              // UTF-8 string walker
   BYTE                c8;                      // UTF-8  character
   ULONG               c32;                     // UTF-32 character


   ENTER();
   TRACES(("size32:%u  utf32 at:%p  utf8: '%s'\n", size32, utf32, utf8));

   while ((rc < size32) && ((c8 = *s8++) != 0)) // room left, and not at end
   {
      if      (c8 < 0x80)                       // regular ASCII range
      {
         c32 = c8;                              // ALL 7 bits in byte-1
         mbC = 0;
      }
      else if (c8 < 0xC0)                       // continuation byte as 1st,
      {                                         // invalid in UTF-8!
         break;
      }
      else if (c8 < 0xE0)                       // lead byte, 2-byte sequence
      {
         c32 = c8 & 0x1F;                       // first 5-bits in byte-1
         mbC = 1;                               // one more byte to read
      }
      else if (c8 < 0xF0)                       // lead byte, 3-byte sequence
      {
         c32 = c8 & 0x0F;                       // first 4-bits in byte-1
         mbC = 2;                               // two more bytes to read
      }
      else if (c8 < 0xF8)                       // lead byte, 4-byte sequence
      {
         c32 = c8 & 0x07;                       // first 3-bits in byte-1
         mbC = 3;                               // three more bytes to read
      }
      else                                      // invalid range in UTF-8!
      {
         break;
      }

      while (mbC > 0)                           // read rest of UTF-8 multi-bytes
      {
         c8 = *s8;                              // peek only, a terminating ZERO
         if ((c8 & 0xC0) != 0x80)               // must never be stepped over
         {
            break;                              // invalid continuation byte!
         }
         c32 = (c32 << 6) | (c8 & 0x3F);        // add the six new bits (LSB)
         s8++;                                  // byte accepted, consume it
         mbC--;
      }
      if (mbC != 0)                             // sequence was incomplete,
      {                                         // stop the whole conversion
         break;
      }
      utf32[ rc++] = c32;                       // one more UTF-32 char converted

      TRLEVX(100,("Add c32: 0x%8.8x\n", c32));
   }                                            // note that there is NO trailing ZERO!
   RETURN (rc);                                 // in the UTF32 stream, just net length
}                                               // end 'dfsUtf8ToUtf32'
/*---------------------------------------------------------------------------*/


#if defined (DUMP)
/*****************************************************************************/
// Trace the first (at most 25) entries of the normalized and reference arrays
/*****************************************************************************/
static void dfsUtfTraceNormArrays
(
   ULONG               count,                   // IN    nr of valid entries
   ULONG               utf32[],                 // IN    UTF-32 normalized array
   BYTE                refs8[]                  // IN    UTF-8  reference  array
)
{
   ULONG               k;

   if (TxTrLevel > 100)
   {
      TRACES(( "nfStr:"));
      for (k = 0; (k < count) && (k < 25); k++)
      {
         TRACES(( " %8.8x", utf32[ k]));
      }
      TRACES(( "\n"));
      TRACES(( "nfRef:"));
      for (k = 0; (k < count) && (k < 25); k++)
      {
         TRACES(( "       %2.2hhx", refs8[ k]));
      }
      TRACES(( "\n"));
   }
}                                               // end 'dfsUtfTraceNormArrays'
#endif


/*****************************************************************************/
// Normalize and optional case-fold an UTF32 string, returning #chars
//
// Each input character is expanded by dfsUtfNormalizeU32C into at most four
// output characters (hence the 4 * length allocation), after that the result
// is reordered by dfsUtfSortCanonical using the per-character reference values.
//
// On success *nfOut receives the allocated output string, to be freed by the
// caller. When nothing was produced (allocation failure, empty input or a
// character that could not be handled) 0 is returned and *nfOut is NULL.
/*****************************************************************************/
ULONG dfsUtfNormCaseFoldStr                     // RET   Length of output string
(
   BOOL                caseInsensitive,         // IN    Case-insensitive
   ULONG               length,                  // IN    length of input string
   ULONG               utf32[],                 // IN    UTF-32 input string
   ULONG              *nfOut[]                  // OUT   normalized/folded string
)                                               //       (allocated) or NULL
{
   ULONG               rCount = 0;              // function return
   BYTE               *nfRef = NULL;            // UTF8  reference/sort   string
   ULONG              *nfStr = NULL;            // UTF32 allocated output string
   ULONG               k;
   int                 nfSize;                  // result of single-char norm/fold

   ENTER();
   TRACES(("caseIns:%s  length:%u  utf32:%p\n", (caseInsensitive) ? "YES" : "NO", length, utf32));

   if (((nfRef = (BYTE  *) TxAlloc( 4 * length, sizeof( BYTE) )) != NULL) && // allocate the
       ((nfStr = (ULONG *) TxAlloc( 4 * length, sizeof( ULONG))) != NULL)  ) // worst-case size
   {
      for (k = 0; k < length; k++)              // iterate over input string
      {
         TRACES(("input at: %3u = 0x%8.8x  output at %3u\n", k, utf32[ k], rCount));
         nfSize = dfsUtfNormalizeU32C( caseInsensitive, utf32[ k], nfStr + rCount, nfRef + rCount);

         if (nfSize < 0)                        // failed to handle this char ...
         {
            rCount = 0;                         // discard everything done so far
            break;
         }
         rCount += nfSize;
      }

      if (rCount > 1)                           // reordering needs 2 or more items,
      {                                         // never call the sort with 0 items
         #if defined (DUMP)
            dfsUtfTraceNormArrays( rCount, nfStr, nfRef);
         #endif

         dfsUtfSortCanonical( rCount, nfStr, nfRef);

         #if defined (DUMP)
            dfsUtfTraceNormArrays( rCount, nfStr, nfRef);
         #endif
      }
   }

   TxFreeMem( nfRef);                           // free temporary array
   if (rCount == 0)
   {
      TxFreeMem( nfStr);                        // free incorrect output
      nfStr = NULL;                             // and make sure NULL is returned
   }
   *nfOut = nfStr;                              // return allocated output string
   RETURN (rCount);
}                                               // end 'dfsUtfNormCaseFoldStr'
/*---------------------------------------------------------------------------*/


/*****************************************************************************/
// Add 32-bit UNI char to normalized array for all values, returning #added
//
// The character is looked up in a three-level table (nf_trie_hi/mid/lo) that
// is indexed by bits 8 and up, bits 4..7 and bits 0..3 of the table index.
// Meaning of a value found at any of the levels:
//
//    0xFFFF        invalid character, function fails with -1
//    0 or 0xADxx   character is kept, low byte is its reference value
//    0xAC00        character is decomposed by dfsUtfNormalizeAC00
//    0xAExx        (mid level only) kept, unless flagged in nf_u16_inv_masks
//    other         index into the next level, or at the lowest level either
//                  a direct replacement character (below 0xB000, or 0xF900
//                  and up) or a reference to a replacement sequence
//
// refs8[] receives one reference value for every character added to utf32[].
// With caseInsensitive set, characters below 0x500 are mapped via nf_basic_cf.
/*****************************************************************************/
static int dfsUtfNormalizeU32C                  // RET   # of 32bit chars added
(                                               //       or -1 on invalid char
   BOOL                caseInsensitive,         // IN    Case-insensitive
   ULONG               c32,                     // IN    32-bit UNI character
   ULONG               utf32[],                 // INOUT UTF-32 normalized array
   BYTE                refs8[]                  // INOUT UTF-8  reference  array
)
{
   int                 rc = 0;
   ULONG               ch_idx;                  // index used for table lookup
   USHORT              hi_res;                  // result from nf_trie_hi
   USHORT              mi_res;                  // result from nf_trie_mid
   USHORT              lo_res;                  // result from nf_trie_lo
   USHORT             *seq_u16 = NULL;          // replacement sequence, 16-bit
   ULONG              *seq_u32 = NULL;          // replacement sequence, 32-bit
   ULONG               seq_len = 0;             // length of that sequence
   ULONG               cnt;
   ULONG               c;

   ENTER();
   TRACES(("Normalize c32: 0x%8.8x  utf32:%p refs8:%p\n", c32, utf32, refs8));

   refs8[0] = 0;                                // default reference value
   if (c32 >= 0xF0000)                          // not in the tables at all
   {
      if ((c32 & 0xFFFE) == 0xFFFE)             // values ending in FFFE or FFFF
      {
         RETURN (-1);                           // are rejected
      }
      else
      {
         utf32[0] = c32;                        // anything else is kept as-is
         TRACES(("Out 1st: 0x%8.8x ref:0x%2.2hhx\n", utf32[0], refs8[0]));
         RETURN (1);
      }
   }

   if (c32 < 0x2FB00)                           // direct index range
   {
      ch_idx = c32;
   }
   else if ((c32 & 0xFFFFFE00) == 0xE0000)      // E0000..E01FF is relocated
   {                                            // to follow on 0x2FB00
      ch_idx = c32 - 0xB0500;
   }
   else                                         // in the gap, not supported
   {
      RETURN (-1);
   }

   hi_res = nf_trie_hi[ch_idx >> 8];

   if (hi_res == 0xFFFF)
   {
      RETURN (-1);
   }
   if (hi_res == 0 || ((hi_res & 0xFF00) == 0xAD00))
   {
      utf32[0] = c32;                           // kept, NO case-folding here
      refs8[0] = hi_res & 0xFF;
      TRACES(("Out 1st: 0x%8.8x ref:0x%2.2hhx\n", utf32[0], refs8[0]));
      RETURN (1);
   }

   if (hi_res == 0xAC00)
   {
      rc = dfsUtfNormalizeAC00( c32, utf32, refs8);
   }
   else
   {
      mi_res = nf_trie_mid[((hi_res & 0xFFF) << 4) | ((ch_idx >> 4) & 0xF)];

      if (mi_res == 0xFFFF)
      {
         RETURN (-1);
      }

      if (mi_res == 0xAC00)
      {
         rc = dfsUtfNormalizeAC00( c32, utf32, refs8);
      }
      else
      {
         if (mi_res == 0 || (mi_res & 0xFF00) == 0xAD00)
         {
            refs8[0] = mi_res & 0xFF;
            if (caseInsensitive && (c32 < 0x500))
            {
               utf32[0] = nf_basic_cf[c32];
            }
            else
            {
               utf32[0] = c32;
            }
            TRACES(("Out 1st: 0x%8.8x ref:0x%2.2hhx\n", utf32[0], refs8[0]));
            RETURN (1);
         }

         if ((mi_res & 0xFF00) == 0xAE00)       // group of 16 with a bitmask
         {                                      // flagging the invalid ones
            USHORT mask = nf_u16_inv_masks[mi_res & 0xFF];
            if ((mask >> (ch_idx & 0xF)) & 1)
            {
               RETURN (-1);
            }
            if (caseInsensitive && (c32 < 0x500))
            {
               utf32[0] = nf_basic_cf[c32];
            }
            else
            {
               utf32[0] = c32;
            }
            TRACES(("Out 1st: 0x%8.8x ref:0x%2.2hhx\n", utf32[0], refs8[0]));
            RETURN (1);
         }

         lo_res = nf_trie_lo[((mi_res & 0xFFF) << 4) | (ch_idx & 0xF)];

         if (lo_res == 0xFFFF)
         {
            RETURN (-1);
         }

         if (lo_res == 0xAC00)
         {
            rc = dfsUtfNormalizeAC00( c32, utf32, refs8);
         }
         else
         {
            if (lo_res < 0xB000 || lo_res >= 0xF900)
            {
               if (lo_res == 0 || ((lo_res & 0xFF00) == 0xAD00))
               {
                  refs8[0] = lo_res & 0xFF;     // kept, with reference value
               }
               else
               {
                  c32 = lo_res;                 // single replacement character
               }

               if (caseInsensitive && (c32 < 0x500))
               {
                  utf32[0] = nf_basic_cf[c32];
               }
               else
               {
                  utf32[0] = c32;
               }
               TRACES(("Out 1st: 0x%8.8x ref:0x%2.2hhx\n", utf32[0], refs8[0]));
               RETURN (1);
            }

            //- lo_res is now 0xB000..0xF8FF: top nibble selects the sequence
            //- table, bit 0x800 (B, C and E) marks a case-fold-only mapping
            switch ((lo_res >> 12) & 0xF)
            {
               case 0xB:                        // fixed sequence of 2, 16-bit
                  if ((lo_res & 0x800) && !caseInsensitive)
                  {
                     utf32[0] = c32;
                     TRACES(("Out 1st: 0x%8.8x ref:0x%2.2hhx\n", utf32[0], refs8[0]));
                     RETURN (1);
                  }
                  seq_u16 = nf_u16_seq_2 + 2 * (lo_res & 0x7FF);
                  seq_len = 2;
                  break;

               case 0xC:                        // fixed sequence of 3, 16-bit
                  if ((lo_res & 0x800) && !caseInsensitive)
                  {
                     utf32[0] = c32;
                     TRACES(("Out 1st: 0x%8.8x ref:0x%2.2hhx\n", utf32[0], refs8[0]));
                     RETURN (1);
                  }
                  seq_u16 = nf_u16_seq_3 + 3 * (lo_res & 0x7FF);
                  seq_len = 3;
                  break;

               case 0xD:                        // variable sequence, 16-bit
                  seq_u16 = nf_u16_seq_misc + (lo_res & 0x3FF) + 1;
                  seq_len = nf_u16_seq_misc[lo_res & 0x3FF]; // header word:
                  refs8[0] = seq_len >> 4;      // upper bits: ref for 1st char
                  seq_len &= 0xF;               // low nibble: sequence length
                  if (seq_len > 4)
                  {
                     RETURN (0);
                  }
                  break;

               case 0xE:                        // single 32-bit replacement
                  if ((lo_res & 0x800) && !caseInsensitive)
                  {
                     utf32[0] = c32;
                     TRACES(("Out 1st: 0x%8.8x ref:0x%2.2hhx\n", utf32[0], refs8[0]));
                     RETURN (1);
                  }
                  seq_u32 = nf_u32_char + (lo_res & 0x7FF);
                  seq_len = 1;
                  break;

               case 0xF:                        // variable sequence, 32-bit
                  seq_u32 = nf_u32_seq_misc + (lo_res & 0x3FF) + 1;
                  seq_len = nf_u32_seq_misc[lo_res & 0x3FF];
                  refs8[0] = seq_len >> 4;      // same header layout as 0xD
                  seq_len &= 0xF;
                  if (seq_len > 4)
                  {
                     RETURN (0);
                  }
                  break;

               default:                         // not reachable, lo_res range
                  RETURN (-1);                  // was checked above; defensive
            }

            for (cnt = 0; cnt < seq_len; cnt++) // copy the replacement sequence
            {
               if (seq_u16)
               {
                  c = seq_u16[cnt];
               }
               else
               {
                  c = seq_u32[cnt];
               }
               utf32[cnt] = c;

               if (cnt > 0)                     // refs8[0] is set already, get
               {                                // the value for each next char
                  if (c >= 0xF0000)
                  {
                     refs8[cnt] = 0;
                     continue;
                  }

                  if (c == 0x3B9)               // special case, fixed value
                  {
                     refs8[cnt] = 0xF0;
                     continue;
                  }

                  ch_idx = (c > 0x2FB00) ? (c - 0xB0500) : c;

                  hi_res = nf_trie_hi[ch_idx >> 8];

                  if (hi_res == 0 || ((hi_res & 0xFF00) == 0xAD00))
                  {
                     refs8[cnt] = hi_res & 0xFF;
                     continue;
                  }

                  mi_res = nf_trie_mid[((hi_res & 0xFFF) << 4) | ((ch_idx >> 4) & 0xF)];

                  if (mi_res == 0 || ((mi_res & 0xFF00) == 0xAE00))
                  {
                     refs8[cnt] = 0;
                     continue;
                  }
                  if ((mi_res & 0xFF00) == 0xAD00)
                  {
                     refs8[cnt] = mi_res & 0xFF;
                     continue;
                  }

                  lo_res = nf_trie_lo[((mi_res & 0xFFF) << 4) | (ch_idx & 0xF)];

                  if ((lo_res & 0xFF00) == 0xAD00)
                  {
                     refs8[cnt] = lo_res & 0xFF;
                  }
                  else
                  {
                     refs8[cnt] = 0;
                  }
               }
            }
            if (caseInsensitive)
            {
               if (utf32[0] < 0x500)            // fold 1st char of the sequence
               {
                  utf32[0] = nf_basic_cf[utf32[0]];
               }
               if (cnt >= 2)
               {
                  if (utf32[cnt - 1] == 0x345)  // trailing 0x345 is replaced
                  {                             // by 0x3B9 when case-folding
                     utf32[cnt - 1] = 0x3B9;
                  }
               }
            }
            rc = cnt;

            #if defined (DUMP)
               if (TxTrLevel > 100)
               {
                  for (cnt = 0; cnt < seq_len; cnt++)
                  {
                     TRACES(("Multi-%u: 0x%8.8x ref:0x%2.2hhx\n", cnt, utf32[cnt], refs8[cnt]));
                  }
               }
            #endif
         }
      }
   }
   RETURN (rc);
}                                               // end 'dfsUtfNormalizeU32C'
/*---------------------------------------------------------------------------*/


#define UBASE_S   0xAC00                        // base of the composed syllables
#define UBASE_L   0x1100                        // base of the leading  parts (L)
#define UBASE_V   0x1161                        // base of the vowel    parts (V)
#define UBASE_T   0x11A7                        // base of the trailing parts (T)
#define UCOUNT_T   28                           // nr of T values (incl 'none')
#define UCOUNT_N  (21 * UCOUNT_T)               // nr of V * T combinations per L

/*****************************************************************************/
// Add 32-bit UNI char to normalized array for AC index, returning #added
//
// Arithmetic decomposition of a composed syllable (offset from 0xAC00) into
// its leading (L), vowel (V) and optional trailing (T) part, as defined for
// Hangul syllables in the Unicode standard. Always adds the L and V part,
// the T part is only added when present (TIndex not 0).
// The reference value for every added character is set to 0.
/*****************************************************************************/
static int dfsUtfNormalizeAC00                 // RET   # of 32bit chars added
(
   ULONG               c32,                     // IN    32-bit UNI character
   ULONG               utf32[],                 // INOUT UTF-32 normalized array
   BYTE                refs8[]                  // INOUT UTF-8  reference  array
)
{
   int                 rc = 2;                  // function return, L + V
   int                 SIndex =  c32    - UBASE_S;
   int                 LIndex = (SIndex / UCOUNT_N);
   int                 VIndex = (SIndex % UCOUNT_N) / UCOUNT_T;
   int                 TIndex = (SIndex % UCOUNT_T);

   ENTER();

   utf32[ 0] = UBASE_L + LIndex;
   refs8[ 0] = 0;
   utf32[ 1] = UBASE_V + VIndex;
   refs8[ 1] = 0;

   TRACES(("AC001st: 0x%8.8x ref:0x%2.2hhx\n", utf32[0], refs8[0]));
   TRACES(("AC002nd: 0x%8.8x ref:0x%2.2hhx\n", utf32[1], refs8[1]));

   if (TIndex > 0)                              // there is a trailing part
   {
      utf32[ 2] = UBASE_T + TIndex;
      refs8[ 2] = 0;
      rc = 3;

      TRACES(("AC003rd: 0x%8.8x ref:0x%2.2hhx\n", utf32[2], refs8[2]));
   }
   RETURN (rc);
}                                               // end 'dfsUtfNormalizeAC00'
/*---------------------------------------------------------------------------*/


/*****************************************************************************/
// Bubble-Sort array of UTF32 characters in sortorder defined by UTF8 values
//
// Only runs of two or more consecutive characters with a non-zero reference
// value are sorted (ascending on that value); characters with a reference
// value of 0 keep their position and delimit the runs. Equal values are
// never swapped, so their relative order is kept.
// Fewer than two items is a no-op.
/*****************************************************************************/
static void dfsUtfSortCanonical
(
   ULONG               items,                   // IN    nr of items to sort
   ULONG               utf32[],                 // INOUT UTF-32 normalized array
   BYTE                refs8[]                  // INOUT UTF-8  reference  array
)
{
   ULONG               i;                       // index inside current run
   ULONG               k;                       // candidate start of a run
   BOOL                done;                    // no swaps in last pass
   BYTE                sort_ref;                // swap temporary, reference
   ULONG               sort_c32;                // swap temporary, character

   ENTER();
   TRACES(("Sorting %u items  utf32 at:%p   refs8 at:%p\n", items, utf32, refs8));

   if (items > 1)                               // avoid unsigned wrap of the
   {                                            // (items - 1) limit below
      for (k = 0; k < (items - 1); k++)
      {
         if ((refs8[ k] == 0) || (refs8[ k + 1] == 0))
         {
            continue;                           // no run of 2+ starting here
         }

         do                                     // repeat passes over this run
         {                                      // until nothing is swapped
            for (done = TRUE, i = k; (i < (items - 1)) && (refs8[ i + 1] != 0); i++)
            {
               if (refs8[ i] > refs8[ i + 1])
               {
                  done          = FALSE;
                  sort_ref      = refs8[ i + 1];
                  refs8[ i + 1] = refs8[ i];
                  refs8[ i]     = sort_ref;
                  sort_c32      = utf32[ i + 1];
                  utf32[ i + 1] = utf32[ i];
                  utf32[ i]     = sort_c32;
               }
            }
         } while (!done);
         k = i + 1;                             // resume after the run; the
      }                                         // item at i + 1 has ref 0
   }
   VRETURN ();
}                                               // end 'dfsUtfSortCanonical'
/*---------------------------------------------------------------------------*/

