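/**
 * Split a string into words according to the Unicode Standard Annex #29
 * (UAX #29) word boundaries, keeping only the tokens whose type is listed
 * in ALLOWED_TYPES_OF_WORDS.
 *
 * @see https://viax.atlassian.net/browse/VIAX-2823
 *
 * @param {string} string - String to split
 * @returns {Array} The `text` of each retained token, in original order
 */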
import * as R from 'ramda';
import Unistring from 'unistring';
import { ALLOWED_TYPES_OF_WORDS } from './constants';

/**
 * Predicate telling whether a token's `type` is one of ALLOWED_TYPES_OF_WORDS.
 * A token whose `type` is missing falls back to the empty string.
 *
 * NOTE(review): R.contains is reported as deprecated in favour of R.includes
 * in newer Ramda releases — confirm the installed version before switching.
 *
 * @param {Object} token - Token as produced by Unistring.getWords
 * @returns {boolean}
 */
const hasAllowedType = (token) =>
  R.contains(R.propOr('', 'type', token), ALLOWED_TYPES_OF_WORDS);

/**
 * Tokenise the input with Unistring.getWords, discard every token whose type
 * is not allowed, and return the `text` of the remaining tokens.
 */
const splitToWords = R.pipe(
  // step 1: segment on UAX#29 word boundaries
  // see http://unicode.org/reports/tr29/#Word_Boundaries
  Unistring.getWords,

  // step 2: keep only tokens of an allowed type (alphanumeric and numeric),
  // mimicking the behavior of the Lucene StandardTokenizer
  // see https://lucene.apache.org/solr/guide/6_6/tokenizers.html#Tokenizers-StandardTokenizer
  R.filter(hasAllowedType),

  // step 3: reduce each surviving token to its text
  R.pluck('text'),
);

export default splitToWords;
