{-# LANGUAGE BangPatterns #-}
-- |
-- Module         : Data.ByteString.Search.DFA
-- Copyright      : Daniel Fischer
-- Licence        : BSD3
-- Maintainer     : Daniel Fischer <daniel.is.fischer@googlemail.com>
-- Stability      : Provisional
-- Portability    : non-portable (BangPatterns)
--
-- Fast search of strict 'S.ByteString' values. Breaking, splitting and
-- replacing using a deterministic finite automaton.

module Data.ByteString.Search.DFA ( -- * Overview
                                    -- $overview

                                    -- ** Complexity and performance
                                    -- $complexity

                                    -- ** Partial application
                                    -- $partial

                                    -- * Finding substrings
                                    indices
                                  , nonOverlappingIndices
                                    -- * Breaking on substrings
                                  , breakOn
                                  , breakAfter
                                    -- * Replacing
                                  , replace
                                    -- * Splitting
                                  , split
                                  , splitKeepEnd
                                  , splitKeepFront
                                  ) where

import Data.ByteString.Search.Internal.Utils (automaton)
import Data.ByteString.Search.Substitution

import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Internal as LI
import Data.ByteString.Unsafe (unsafeIndex)

import Data.Array.Base (unsafeAt)
--import Data.Array.Unboxed

import Data.Bits

-- $overview
--
-- This module provides functions related to searching a substring within
-- a string. The searching algorithm uses a deterministic finite automaton
-- based on the Knuth-Morris-Pratt algorithm.
-- The automaton is implemented as an array of @(patternLength + 1) * &#963;@
-- state transitions, where &#963; is the alphabet size (256), so it is
-- only suitable for short enough patterns.
--
-- When searching a pattern in a UTF-8-encoded 'S.ByteString', be aware that
-- these functions work on bytes, not characters, so the indices are
-- byte-offsets, not character offsets.

-- $complexity
--
-- The time and space complexity of the preprocessing phase is
-- /O/(@patternLength * &#963;@).
-- The searching phase is /O/(@targetLength@), each target character is
-- inspected only once.
--
-- In general the functions in this module are slightly faster than the
-- corresponding functions using the Knuth-Morris-Pratt algorithm but
-- considerably slower than the Boyer-Moore functions. For very short
-- patterns or, in the case of 'indices', patterns with a short period
-- which occur often, however, times are close to or even below the
-- Boyer-Moore times.

-- $partial
--
-- All functions can usefully be partially applied. Given only a pattern,
-- the automaton is constructed only once, allowing efficient re-use.

------------------------------------------------------------------------------
--                            Exported Functions                            --
------------------------------------------------------------------------------

-- | @'indices'@ finds the starting indices of all possibly overlapping
--   occurrences of the pattern in the target string.
--   If the pattern is empty, the result is @[0 .. 'length' target]@.
{-# INLINE indices #-}
indices :: S.ByteString     -- ^ Pattern to find
        -> S.ByteString     -- ^ String to search
        -> [Int]            -- ^ Offsets of matches
indices :: ByteString -> ByteString -> [Int]
indices = Bool -> ByteString -> ByteString -> [Int]
strictSearcher Bool
True

-- | @'nonOverlappingIndices'@ finds the starting indices of all
--   non-overlapping occurrences of the pattern in the target string.
--   It is more efficient than removing indices from the list produced
--   by 'indices'.
{-# INLINE nonOverlappingIndices #-}
nonOverlappingIndices :: S.ByteString   -- ^ Pattern to find
                      -> S.ByteString   -- ^ String to search
                      -> [Int]          -- ^ Offsets of matches
nonOverlappingIndices :: ByteString -> ByteString -> [Int]
nonOverlappingIndices = Bool -> ByteString -> ByteString -> [Int]
strictSearcher Bool
False

-- | @'breakOn' pattern target@ splits @target@ at the first occurrence
--   of @pattern@. If the pattern does not occur in the target, the
--   second component of the result is empty, otherwise it starts with
--   @pattern@. If the pattern is empty, the first component is empty.
--
-- @
--   'uncurry' 'S.append' . 'breakOn' pattern = 'id'
-- @
breakOn :: S.ByteString  -- ^ String to search for
        -> S.ByteString  -- ^ String to search in
        -> (S.ByteString, S.ByteString)
                         -- ^ Head and tail of string broken at substring
breakOn :: ByteString -> ByteString -> (ByteString, ByteString)
breakOn ByteString
pat = ByteString -> (ByteString, ByteString)
breaker
  where
    searcher :: ByteString -> [Int]
searcher = Bool -> ByteString -> ByteString -> [Int]
strictSearcher Bool
False ByteString
pat
    breaker :: ByteString -> (ByteString, ByteString)
breaker ByteString
str = case ByteString -> [Int]
searcher ByteString
str of
                    []      -> (ByteString
str, ByteString
S.empty)
                    (Int
i:[Int]
_)   -> Int -> ByteString -> (ByteString, ByteString)
S.splitAt Int
i ByteString
str

-- | @'breakAfter' pattern target@ splits @target@ behind the first occurrence
--   of @pattern@. An empty second component means that either the pattern
--   does not occur in the target or the first occurrence of pattern is at
--   the very end of target. To discriminate between those cases, use e.g.
--   'S.isSuffixOf'.
--
-- @
--   'uncurry' 'S.append' . 'breakAfter' pattern = 'id'
-- @
breakAfter :: S.ByteString  -- ^ String to search for
           -> S.ByteString  -- ^ String to search in
           -> (S.ByteString, S.ByteString)
                            -- ^ Head and tail of string broken after substring
breakAfter :: ByteString -> ByteString -> (ByteString, ByteString)
breakAfter ByteString
pat = ByteString -> (ByteString, ByteString)
breaker
  where
    !patLen :: Int
patLen = ByteString -> Int
S.length ByteString
pat
    searcher :: ByteString -> [Int]
searcher = Bool -> ByteString -> ByteString -> [Int]
strictSearcher Bool
False ByteString
pat
    breaker :: ByteString -> (ByteString, ByteString)
breaker ByteString
str = case ByteString -> [Int]
searcher ByteString
str of
                    []      -> (ByteString
str, ByteString
S.empty)
                    (Int
i:[Int]
_)   -> Int -> ByteString -> (ByteString, ByteString)
S.splitAt (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
patLen) ByteString
str


-- | @'replace' pat sub text@ replaces all (non-overlapping) occurrences of
--   @pat@ in @text@ with @sub@. If occurrences of @pat@ overlap, the first
--   occurrence that does not overlap with a replaced previous occurrence
--   is substituted. Occurrences of @pat@ arising from a substitution
--   will not be substituted. For example:
--
-- @
--   'replace' \"ana\" \"olog\" \"banana\" = \"bologna\"
--   'replace' \"ana\" \"o\" \"bananana\" = \"bono\"
--   'replace' \"aab\" \"abaa\" \"aaabb\" = \"aabaab\"
-- @
--
--   The result is a /lazy/ 'L.ByteString',
--   which is lazily produced, without copying.
--   Equality of pattern and substitution is not checked, but
--
-- @
--   'S.concat' . 'L.toChunks' $ 'replace' pat pat text == text
-- @
--
--   holds. If the pattern is empty but not the substitution, the result
--   is equivalent to (were they 'String's) @'cycle' sub@.
--
--   For non-empty @pat@ and @sub@ a strict 'S.ByteString',
--
-- @
--   'L.fromChunks' . 'Data.List.intersperse' sub . 'split' pat = 'replace' pat sub
-- @
--
--   and analogous relations hold for other types of @sub@.
replace :: Substitution rep
        => S.ByteString     -- ^ Substring to replace
        -> rep              -- ^ Replacement string
        -> S.ByteString     -- ^ String to modify
        -> L.ByteString     -- ^ Lazy result
replace :: forall rep.
Substitution rep =>
ByteString -> rep -> ByteString -> ByteString
replace ByteString
pat
  | ByteString -> Bool
S.null ByteString
pat = \rep
sub -> rep -> ByteString -> ByteString
forall a. Substitution a => a -> ByteString -> ByteString
prependCycle rep
sub (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> ByteString -> ByteString)
-> ByteString -> ByteString -> ByteString
forall a b c. (a -> b -> c) -> b -> a -> c
flip ByteString -> ByteString -> ByteString
LI.chunk ByteString
LI.Empty
  | Bool
otherwise =
    let !patLen :: Int
patLen = ByteString -> Int
S.length ByteString
pat
        searcher :: ByteString -> [Int]
searcher = Bool -> ByteString -> ByteString -> [Int]
strictSearcher Bool
False ByteString
pat
        repl :: p -> ByteString -> [ByteString]
repl p
sub =
          let {-# NOINLINE subst #-}
              !subst :: [ByteString] -> [ByteString]
subst = p -> [ByteString] -> [ByteString]
forall a. Substitution a => a -> [ByteString] -> [ByteString]
substitution p
sub
              replacer :: ByteString -> [ByteString]
replacer ByteString
str
                | ByteString -> Bool
S.null ByteString
str    = []
                | Bool
otherwise     =
                  case ByteString -> [Int]
searcher ByteString
str of
                    []              -> [ByteString
str]
                    (Int
i:[Int]
_)
                        | Int
i Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0    -> [ByteString] -> [ByteString]
subst ([ByteString] -> [ByteString]) -> [ByteString] -> [ByteString]
forall a b. (a -> b) -> a -> b
$ ByteString -> [ByteString]
replacer (Int -> ByteString -> ByteString
S.drop Int
patLen ByteString
str)
                        | Bool
otherwise -> Int -> ByteString -> ByteString
S.take Int
i ByteString
str ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: [ByteString] -> [ByteString]
subst
                                        (ByteString -> [ByteString]
replacer (Int -> ByteString -> ByteString
S.drop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
patLen) ByteString
str))
          in ByteString -> [ByteString]
replacer
    in \rep
sub -> [ByteString] -> ByteString
L.fromChunks ([ByteString] -> ByteString)
-> (ByteString -> [ByteString]) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. rep -> ByteString -> [ByteString]
forall {p}. Substitution p => p -> ByteString -> [ByteString]
repl rep
sub

-- | @'split' pattern target@ splits @target@ at each (non-overlapping)
--   occurrence of @pattern@, removing @pattern@. If @pattern@ is empty,
--   the result is an infinite list of empty 'S.ByteString's, if @target@
--   is empty but not @pattern@, the result is an empty list, otherwise
--   the following relations hold:
--
-- @
--   'S.concat' . 'Data.List.intersperse' pat . 'split' pat = 'id',
--   'length' ('split' pattern target) ==
--               'length' ('nonOverlappingIndices' pattern target) + 1,
-- @
--
--   no fragment in the result contains an occurrence of @pattern@.
split :: S.ByteString   -- ^ Pattern to split on
      -> S.ByteString   -- ^ String to split
      -> [S.ByteString] -- ^ Fragments of string
split :: ByteString -> ByteString -> [ByteString]
split ByteString
pat
  | ByteString -> Bool
S.null ByteString
pat  = [ByteString] -> ByteString -> [ByteString]
forall a b. a -> b -> a
const (ByteString -> [ByteString]
forall a. a -> [a]
repeat ByteString
S.empty)
split ByteString
pat = ByteString -> [ByteString]
splitter
  where
    !patLen :: Int
patLen = ByteString -> Int
S.length ByteString
pat
    searcher :: ByteString -> [Int]
searcher = Bool -> ByteString -> ByteString -> [Int]
strictSearcher Bool
False ByteString
pat
    splitter :: ByteString -> [ByteString]
splitter ByteString
str
      | ByteString -> Bool
S.null ByteString
str = []
      | Bool
otherwise  = ByteString -> [ByteString]
splitter' ByteString
str
    splitter' :: ByteString -> [ByteString]
splitter' ByteString
str
      | ByteString -> Bool
S.null ByteString
str = [ByteString
S.empty]
      | Bool
otherwise  =
        case ByteString -> [Int]
searcher ByteString
str of
          []    -> [ByteString
str]
          (Int
i:[Int]
_) -> Int -> ByteString -> ByteString
S.take Int
i ByteString
str ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: ByteString -> [ByteString]
splitter' (Int -> ByteString -> ByteString
S.drop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
patLen) ByteString
str)

-- | @'splitKeepEnd' pattern target@ splits @target@ after each (non-overlapping)
--   occurrence of @pattern@. If @pattern@ is empty, the result is an
--   infinite list of empty 'S.ByteString's, otherwise the following
--   relations hold:
--
-- @
--   'S.concat' . 'splitKeepEnd' pattern = 'id',
-- @
--
--   all fragments in the result except possibly the last end with
--   @pattern@, no fragment contains more than one occurrence of @pattern@.
splitKeepEnd :: S.ByteString    -- ^ Pattern to split on
             -> S.ByteString    -- ^ String to split
             -> [S.ByteString]  -- ^ Fragments of string
splitKeepEnd :: ByteString -> ByteString -> [ByteString]
splitKeepEnd ByteString
pat
  | ByteString -> Bool
S.null ByteString
pat = [ByteString] -> ByteString -> [ByteString]
forall a b. a -> b -> a
const (ByteString -> [ByteString]
forall a. a -> [a]
repeat ByteString
S.empty)
splitKeepEnd ByteString
pat = ByteString -> [ByteString]
splitter
  where
    !patLen :: Int
patLen = ByteString -> Int
S.length ByteString
pat
    searcher :: ByteString -> [Int]
searcher = Bool -> ByteString -> ByteString -> [Int]
strictSearcher Bool
False ByteString
pat
    splitter :: ByteString -> [ByteString]
splitter ByteString
str
      | ByteString -> Bool
S.null ByteString
str  = []
      | Bool
otherwise   =
        case ByteString -> [Int]
searcher ByteString
str of
          []    -> [ByteString
str]
          (Int
i:[Int]
_) -> Int -> ByteString -> ByteString
S.take (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
patLen) ByteString
str ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:
                        ByteString -> [ByteString]
splitter (Int -> ByteString -> ByteString
S.drop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
patLen) ByteString
str)

-- | @'splitKeepFront'@ is like 'splitKeepEnd', except that @target@ is split
--   before each occurrence of @pattern@ and hence all fragments
--   with the possible exception of the first begin with @pattern@.
--   No fragment contains more than one non-overlapping occurrence
--   of @pattern@.
splitKeepFront :: S.ByteString    -- ^ Pattern to split on
               -> S.ByteString    -- ^ String to split
               -> [S.ByteString]  -- ^ Fragments of string
splitKeepFront :: ByteString -> ByteString -> [ByteString]
splitKeepFront ByteString
pat
  | ByteString -> Bool
S.null ByteString
pat  = [ByteString] -> ByteString -> [ByteString]
forall a b. a -> b -> a
const (ByteString -> [ByteString]
forall a. a -> [a]
repeat ByteString
S.empty)
splitKeepFront ByteString
pat = ByteString -> [ByteString]
splitter
  where
    !patLen :: Int
patLen = ByteString -> Int
S.length ByteString
pat
    searcher :: ByteString -> [Int]
searcher = Bool -> ByteString -> ByteString -> [Int]
strictSearcher Bool
False ByteString
pat
    splitter :: ByteString -> [ByteString]
splitter ByteString
str
      | ByteString -> Bool
S.null ByteString
str  = []
      | Bool
otherwise   =
        case ByteString -> [Int]
searcher ByteString
str of
          []            -> [ByteString
str]
          (Int
i:[Int]
rst)
            | Int
i Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0    -> case [Int]
rst of
                             []     -> [ByteString
str]
                             (Int
j:[Int]
_)  -> Int -> ByteString -> ByteString
S.take Int
j ByteString
str ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: ByteString -> [ByteString]
splitter' (Int -> ByteString -> ByteString
S.drop Int
j ByteString
str)
            | Bool
otherwise -> Int -> ByteString -> ByteString
S.take Int
i ByteString
str ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: ByteString -> [ByteString]
splitter' (Int -> ByteString -> ByteString
S.drop Int
i ByteString
str)
    splitter' :: ByteString -> [ByteString]
splitter' ByteString
str
      | ByteString -> Bool
S.null ByteString
str  = []
      | Bool
otherwise   =
        case ByteString -> [Int]
searcher (Int -> ByteString -> ByteString
S.drop Int
patLen ByteString
str) of
          []    -> [ByteString
str]
          (Int
i:[Int]
_) -> Int -> ByteString -> ByteString
S.take (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
patLen) ByteString
str ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:
                        ByteString -> [ByteString]
splitter' (Int -> ByteString -> ByteString
S.drop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
patLen) ByteString
str)

------------------------------------------------------------------------------
--                            Searching Function                            --
------------------------------------------------------------------------------

strictSearcher :: Bool -> S.ByteString -> S.ByteString -> [Int]
strictSearcher :: Bool -> ByteString -> ByteString -> [Int]
strictSearcher Bool
_ !ByteString
pat
    | ByteString -> Bool
S.null ByteString
pat = Int -> Int -> [Int]
forall a. Enum a => a -> a -> [a]
enumFromTo Int
0 (Int -> [Int]) -> (ByteString -> Int) -> ByteString -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Int
S.length
    | ByteString -> Int
S.length ByteString
pat Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1 = let !w :: Word8
w = ByteString -> Word8
S.head ByteString
pat in Word8 -> ByteString -> [Int]
S.elemIndices Word8
w
strictSearcher !Bool
overlap ByteString
pat = ByteString -> [Int]
search
  where
    !patLen :: Int
patLen = ByteString -> Int
S.length ByteString
pat
    !auto :: UArray Int Int
auto   = ByteString -> UArray Int Int
automaton ByteString
pat
    !p0 :: Word8
p0     = ByteString -> Int -> Word8
unsafeIndex ByteString
pat Int
0
    !ams :: Int
ams    = if Bool
overlap then Int
patLen else Int
0
    search :: ByteString -> [Int]
search ByteString
str = Int -> Int -> [Int]
match Int
0 Int
0
      where
        !strLen :: Int
strLen = ByteString -> Int
S.length ByteString
str
        {-# INLINE strAt #-}
        strAt :: Int -> Int
        strAt :: Int -> Int
strAt !Int
i = Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (ByteString -> Int -> Word8
unsafeIndex ByteString
str Int
i)
        match :: Int -> Int -> [Int]
match Int
0 Int
idx
          | Int
idx Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
strLen               = []
          | ByteString -> Int -> Word8
unsafeIndex ByteString
str Int
idx Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
p0   = Int -> Int -> [Int]
match Int
1 (Int
idx Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
          | Bool
otherwise                   = Int -> Int -> [Int]
match Int
0 (Int
idx Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
        match Int
state Int
idx
          | Int
idx Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
strLen   = []
          | Bool
otherwise       =
            let !nstate :: Int
nstate = UArray Int Int -> Int -> Int
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> Int -> e
unsafeAt UArray Int Int
auto ((Int
state Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
8) Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int -> Int
strAt Int
idx)
                !nxtIdx :: Int
nxtIdx = Int
idx Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
            in if Int
nstate Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
patLen
                then (Int
nxtIdx Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
patLen) Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
: Int -> Int -> [Int]
match Int
ams Int
nxtIdx
                else Int -> Int -> [Int]
match Int
nstate Int
nxtIdx