How to map SQL collation setting to a Java comparator?

Question

Is there a way to translate a database's collation setting (e.g. SQL_Latin1_General_CP1_CI_AS) into a Java Comparator implementation, so that Java code can apply the same ordering as the database does?

Is there an existing library that already provides this mapping?

Gili · Accepted Answer

I ended up doing the following:

Query the current database's collation setting.
Next, parse the description of the collation into sub-components such as "case-insensitive" or "accent-sensitive".
Next, construct a Comparator corresponding to these rules.

Enjoy!

/**
 * Returns the Comparator associated with the database's default collation.
 * 
 * Beware! Some databases sort unicode strings differently than
 * non-unicode strings, even for the same collation setting.
 * 

 * @param unicode true if the String being sorted is unicode, false otherwise
 * @return the Comparator associated with the database's default collation
 * @throws DatabaseException if an unexpected database error occurs
 */
public Comparator getComparator(boolean unicode)
    throws DatabaseException
{
    // @see http://stackoverflow.com/a/5072926/14731, http://stackoverflow.com/a/27052010/14731 and
    // http://stackoverflow.com/q/32209137/14731
    try (Connection connection = server.getDatasource().getConnection())
    {
        try (PreparedStatement statement = connection.prepareStatement(
            "SELECT description from sys.fn_HelpCollations()
" +
            "WHERE name = SERVERPROPERTY('collation')"))
        {
            try (ResultSet rs = statement.executeQuery())
            {
                if (!rs.next())
                    throw new ObjectNotFoundException(this);
                String description = rs.getString(1);
                List tokens = Arrays.asList(description.split(",\s*"));
                // Description format: language,property1,property2,...,propertyN,sorting,...
                ComparatorBuilder comparatorBuilder = new ComparatorBuilder();

                // Skip the language
                tokens = tokens.subList(1, tokens.size());
                // See https://technet.microsoft.com/en-US/library/ms143515(v=SQL.90).aspx for a list of possible tokens
                for (String token: tokens)
                {
                    if (token.toLowerCase().contains("sort"))
                    {
                        // Stop as soon as we hit information related to the sorting order
                        break;
                    }
                    switch (token)
                    {
                        case "case-insensitive":
                        {
                            comparatorBuilder.caseInsensitive(true);
                            break;
                        }
                        case "accent-insensitive":
                        {
                            comparatorBuilder.accentInsensitive(true);
                            break;
                        }
                        case "kanatype-insensitive":
                        {
                            comparatorBuilder.kanaInsensitive(true);
                            break;
                        }
                        case "width-insensitive":
                        case "width-insensitive for Unicode Data":
                        {
                            comparatorBuilder.widthInsensitive(true);
                            break;
                        }
                        case "case-sensitive":
                        case "accent-sensitive":
                        case "kanatype-sensitive":
                        case "width-sensitive":
                        {
                            // Do nothing, this is the default setting.
                            break;
                        }
                        default:
                            throw new AssertionError(String.format("Unexpected token: '%s'. Description: '%s'", token, description));
                    }
                }
                assert (!rs.next()): "Database returned more rows than expected";
                if (unicode)
                    comparatorBuilder.discardHyphens(true);
                return comparatorBuilder.build();
            }
        }
    }
    catch (SQLException e)
    {
        throw new DatabaseException(e);
    }
}

import com.ibm.icu.text.Transliterator;
import java.text.Normalizer;
import java.util.Comparator;
import java.util.regex.Pattern;

/**
 * Converts a database collation to a Java comparator.
 * 
 * @see https://msdn.microsoft.com/en-us/library/hh230914.aspx?f=255&MSPPError=-2147217396
 * @see http://zarez.net/?p=1893
 * @author Gili Tzabari
 */
class ComparatorBuilder
{
    // SQL Server: https://technet.microsoft.com/en-US/library/ms143515(v=SQL.90).aspx
    private boolean caseInsensitive = false;
    private boolean accentInsensitive = false;
    private boolean kanaInsensitive = false;
    private boolean widthInsensitive = false;
    /**
     * Indicates if hyphens should be discarded prior to sorting (default = false).
     */
    private boolean discardHyphens = false;

    /**
     * @return true if the comparator ignores the difference between uppercase and lowercase letters (default = false)
     */
    public boolean caseInsensitive()
    {
        return caseInsensitive;
    }

    /**
     * @param value true if the comparator ignores the difference between uppercase and lowercase letters
     * @return this
     */
    public ComparatorBuilder caseInsensitive(boolean value)
    {
        this.caseInsensitive = value;
        return this;
    }

    /**
     * @return true if the comparator ignores the difference between accented and unaccented characters (default = false)
     */
    public boolean accentInsensitive()
    {
        return accentInsensitive;
    }

    /**
     * @param value true if the comparator ignores the difference between accented and unaccented characters
     * @return this
     */
    public ComparatorBuilder accentInsensitive(boolean value)
    {
        this.accentInsensitive = value;
        return this;
    }

    /**
     * @return true if the comparator ignores the difference between the two types of Japanese kana characters: Hiragana
     *         and Katakana (default = false)
     */
    public boolean kanaInsensitive()
    {
        return kanaInsensitive;
    }

    /**
     * @param value true if the comparator ignores the difference between the two types of Japanese kana characters:
     *              Hiragana and Katakana
     * @return this
     */
    public ComparatorBuilder kanaInsensitive(boolean value)
    {
        this.kanaInsensitive = value;
        return this;
    }

    /**
     * @return true if the comparator ignores the difference between a single-byte character and the same character when
     *         represented as a double-byte character (default = false)
     */
    public boolean widthInsensitive()
    {
        return widthInsensitive;
    }

    /**
     * @param value true if the comparator ignores the difference between a single-byte character and the same character
     *              when represented as a double-byte character
     * @return this
     */
    public ComparatorBuilder widthInsensitive(boolean value)
    {
        this.widthInsensitive = value;
        return this;
    }

    /**
     * @return true if the comparator discards hyphens prior to sorting (default = false)
     */
    public boolean discardHyphens()
    {
        return discardHyphens;
    }

    /**
     * @param value true if comparator discards hyphens prior to sorting
     * @return this
     */
    public ComparatorBuilder discardHyphens(boolean value)
    {
        this.discardHyphens = value;
        return this;
    }

    /**
     * @return a Comparator instance
     */
    public Comparator build()
    {
        return (java.lang.String first, java.lang.String second) ->
        {
            String firstNormalized = first;
            String secondNormalized = second;
            if (discardHyphens)
            {
                firstNormalized = firstNormalized.replaceAll("-", "");
                secondNormalized = secondNormalized.replaceAll("-", "");
            }
            if (accentInsensitive)
            {
                // @see http://stackoverflow.com/a/3322174/14731
                firstNormalized = Normalizer.normalize(first, Normalizer.Form.NFD).replaceAll("[^\p{ASCII}]", "");
                secondNormalized = Normalizer.normalize(second, Normalizer.Form.NFD).replaceAll("[^\p{ASCII}]", "");
            }
            if (kanaInsensitive)
            {
                // @see http://stackoverflow.com/a/6577778/14731
                Transliterator transliterator = Transliterator.getInstance("Hiragana-Katakana");
                firstNormalized = transliterator.transliterate(firstNormalized);
                secondNormalized = transliterator.transliterate(secondNormalized);
            }
            if (widthInsensitive)
            {
                Transliterator transliterator = Transliterator.getInstance("Halfwidth-Fullwidth");
                firstNormalized = transliterator.transliterate(firstNormalized);
                secondNormalized = transliterator.transliterate(secondNormalized);
            }
            // Case-normalization is not as easy as it seems. See
            // http://mattryall.net/blog/2009/02/the-infamous-turkish-locale-bug and the implementation of
            // String.compareToIgnoreCase(). Better to delegate to a trusted implementation.
            if (caseInsensitive)
                return firstNormalized.compareToIgnoreCase(secondNormalized);
            else
                return firstNormalized.compareTo(secondNormalized);
        };
    }
}

How to map SQL collation setting to a Java comparator?

Answers (2)

Related Questions