@@ -113,6 +113,17 @@ module Statistics.Sample.Fold
113113 -- * Joint distributions
114114 , covariance
115115 , correlation
116+
117+ -- ** Single pass Length, Mean, Variance, Skewness and Kurtosis
118+ -- $lmvsk
119+ , LMVSK (.. )
120+ , LMVSKState
121+ , fastLMVSK
122+ , fastLMVSKu
123+ , foldLMVSKState
124+ , getLMVSK
125+ , getLMVSKu
126+
116127 -- * Strict types and helpers
117128 , F. fold
118129 , biExpectation
@@ -522,6 +533,144 @@ correlation (muX, muY) =
522533 (\ (_,y) -> square (y - muY))
523534{-# INLINE correlation #-}
524535
536+ -- $lmvsk
537+ --
538+ -- 'LMVSK' represents the Length, Mean, Variance, Skewness
539+ -- and Kurtosis of a sample. If you need to compute several
540+ -- of these statistics at once, using these folds may be more
541+ -- efficient than combining the 'Fold's above Applicatively.
542+ --
543+ -- The algorithm used is similar to the found in 'fastVariance' etc.
544+ -- but common subexpressions are shared.
545+
546+ data LMVSK = LMVSK
547+ { lmvskLength :: {-# UNPACK #-}! Int
548+ , lmvskMean :: {-# UNPACK #-}! Double
549+ , lmvskVariance :: {-# UNPACK #-}! Double
550+ , lmvskSkewness :: {-# UNPACK #-}! Double
551+ , lmvskKurtosis :: {-# UNPACK #-}! Double
552+ } deriving (Show , Eq )
553+
554+ -- | Intermediate state for computing 'LMVSK'. This type's 'Semigroup'
555+ -- instance allows it to be computed in parallel and combined
556+ newtype LMVSKState = LMVSKState LMVSK
557+
558+ instance Monoid LMVSKState where
559+ {-# INLINE mempty #-}
560+ mempty = LMVSKState lmvskZero
561+ {-# INLINE mappend #-}
562+ mappend = (<>)
563+
564+ instance Semigroup LMVSKState where
565+ {-# INLINE (<>) #-}
566+ (LMVSKState (LMVSK an am1 am2 am3 am4)) <> (LMVSKState (LMVSK bn bm1 bm2 bm3 bm4))
567+ = LMVSKState (LMVSK n m1 m2 m3 m4) where
568+ fi :: Int -> Double
569+ fi = fromIntegral
570+ -- combined.n = a.n + b.n;
571+ n = an+ bn
572+ n2 = n* n
573+ nd = fi n
574+ nda = fi an
575+ ndb = fi bn
576+ -- delta = b.M1 - a.M1;
577+ delta = bm1 - am1
578+ -- delta2 = delta*delta;
579+ delta2 = delta* delta
580+ -- delta3 = delta*delta2;
581+ delta3 = delta* delta2
582+ -- delta4 = delta2*delta2;
583+ delta4 = delta2* delta2
584+ -- combined.M1 = (a.n*a.M1 + b.n*b.M1) / combined.n;
585+ m1 = (nda* am1 + ndb* bm1 ) / nd
586+ -- combined.M2 = a.M2 + b.M2 + delta2*a.n*b.n / combined.n;
587+ m2 = am2 + bm2 + delta2* nda* ndb / nd
588+ -- combined.M3 = a.M3 + b.M3 + delta3*a.n*b.n* (a.n - b.n)/(combined.n*combined.n);
589+ m3 = am3 + bm3 + delta3* nda* ndb* fi( an - bn )/ fi n2
590+ -- combined.M3 += 3.0*delta * (a.n*b.M2 - b.n*a.M2) / combined.n;
591+ + 3.0 * delta * (nda* bm2 - ndb* am2 ) / nd
592+ --
593+ -- combined.M4 = a.M4 + b.M4 + delta4*a.n*b.n * (a.n*a.n - a.n*b.n + b.n*b.n) /(combined.n*combined.n*combined.n);
594+ m4 = am4 + bm4 + delta4* nda* ndb * fi(an* an - an* bn + bn* bn ) / fi (n* n* n)
595+ -- combined.M4 += 6.0*delta2 * (a.n*a.n*b.M2 + b.n*b.n*a.M2)/(combined.n*combined.n) +
596+ + 6.0 * delta2 * (nda* nda* bm2 + ndb* ndb* am2) / fi n2
597+ -- 4.0*delta*(a.n*b.M3 - b.n*a.M3) / combined.n;
598+ + 4.0 * delta* (nda* bm3 - ndb* am3) / nd
599+
600+ -- | Efficiently compute the
601+ -- __length, mean, variance, skewness and kurtosis__ with a single pass.
602+ {-# INLINE fastLMVSK #-}
603+ fastLMVSK :: F. Fold Double LMVSK
604+ fastLMVSK = getLMVSK <$> foldLMVSKState
605+
606+ -- | Efficiently compute the
607+ -- __length, mean, unbiased variance, skewness and kurtosis__ with a single pass.
608+ {-# INLINE fastLMVSKu #-}
609+ fastLMVSKu :: F. Fold Double LMVSK
610+ fastLMVSKu = getLMVSKu <$> foldLMVSKState
611+
612+ -- | Initial 'LMVSKState'
613+ {-# INLINE lmvskZero #-}
614+ lmvskZero :: LMVSK
615+ lmvskZero = LMVSK 0 0 0 0 0
616+
617+ -- | Performs the heavy lifting of fastLMVSK. This is exposed
618+ -- because the internal `LMVSKState` is monoidal, allowing you
619+ -- to run these statistics in parallel over datasets which are
620+ -- split and then combine the results.
621+ {-# INLINE foldLMVSKState #-}
622+ foldLMVSKState :: F. Fold Double LMVSKState
623+ foldLMVSKState = F. Fold stepLMVSKState (LMVSKState lmvskZero) id
624+
625+ {-# INLINE stepLMVSKState #-}
626+ stepLMVSKState :: LMVSKState -> Double -> LMVSKState
627+ stepLMVSKState (LMVSKState (LMVSK n1 m1 m2 m3 m4)) x = LMVSKState $ LMVSK n m1' m2' m3' m4' where
628+ fi :: Int -> Double
629+ fi = fromIntegral
630+ -- long long n1 = n;
631+ -- n++;
632+ n = n1+ 1
633+ -- delta = x - M1;
634+ delta = x - m1
635+ -- delta_n = delta / n;
636+ delta_n = delta / fi n
637+ -- delta_n2 = delta_n * delta_n;
638+ delta_n2 = delta_n * delta_n
639+ -- term1 = delta * delta_n * n1;
640+ term1 = delta * delta_n * fi n1
641+ -- M1 += delta_n;
642+ m1' = m1 + delta_n
643+ -- M4 += term1 * delta_n2 * (n*n - 3*n + 3) + 6 * delta_n2 * M2 - 4 * delta_n * M3;
644+ m4' = m4 + term1 * delta_n2 * fi (n* n - 3 * n + 3 ) + 6 * delta_n2 * m2 - 4 * delta_n * m3
645+ -- M3 += term1 * delta_n * (n - 2) - 3 * delta_n * M2;
646+ m3' = m3 + term1 * delta_n * fi (n - 2 ) - 3 * delta_n * m2
647+ -- M2 += term1;
648+ m2' = m2 + term1
649+
650+ -- | Returns the stats which have been computed in a LMVSKState with /population/ variance.
651+ getLMVSK :: LMVSKState -> LMVSK
652+ getLMVSK (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where
653+ nd = fromIntegral n
654+ -- M2/(n-1.0)
655+ m2' = m2 / nd
656+ -- sqrt(double(n)) * M3/ pow(M2, 1.5)
657+ m3' = sqrt nd * m3 / (m2 ** 1.5 )
658+ -- double(n)*M4 / (M2*M2) - 3.0
659+ m4' = nd* m4 / (m2* m2) - 3.0
660+
661+ -- | Returns the stats which have been computed in a LMVSKState,
662+ -- with the /sample/ variance.
663+ getLMVSKu :: LMVSKState -> LMVSK
664+ getLMVSKu (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where
665+ nd = fromIntegral n
666+ -- M2/(n-1.0)
667+ m2' = m2 / (nd- 1 )
668+ -- sqrt(double(n)) * M3/ pow(M2, 1.5)
669+ m3' = sqrt nd * m3 / (m2 ** 1.5 )
670+ -- double(n)*M4 / (M2*M2) - 3.0
671+ m4' = nd* m4 / (m2* m2) - 3.0
672+
673+
525674--------
526675
527676-- $references
@@ -542,3 +691,6 @@ correlation (muX, muY) =
542691-- * West, D.H.D. (1979) Updating mean and variance estimates: an
543692-- improved method. /Communications of the ACM/
544693-- 22(9):532–535. <http://doi.acm.org/10.1145/359146.359153>
694+ --
695+ -- * John D. Cook. Computing skewness and kurtosis in one pass
696+ -- <http://www.johndcook.com/blog/skewness_kurtosis/>
0 commit comments