Descriptive.php 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766
  1. <?php
  2. namespace MathPHP\Statistics;
  3. use MathPHP\Exception;
  4. /**
  5. * Descriptive statistics
  6. * Summary statistics that quantitatively describe or summarize features of a collection of information.
  7. * https://en.wikipedia.org/wiki/Descriptive_statistics
  8. */
  9. class Descriptive
  10. {
  11. public const POPULATION = true;
  12. public const SAMPLE = false;
  /**
   * Range: distance between the largest and the smallest value.
   * It is the width of the narrowest interval that holds every data point
   * and gives a rough indication of statistical dispersion.
   * (https://en.wikipedia.org/wiki/Range_(statistics))
   *
   * R = max x - min x
   *
   * @param float[] $numbers
   *
   * @return float largest value minus smallest value
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function range(array $numbers): float
  {
      if (\count($numbers) === 0) {
          throw new Exception\BadDataException('Cannot find the range of an empty list of numbers');
      }

      $largest  = \max($numbers);
      $smallest = \min($numbers);

      return $largest - $smallest;
  }
  /**
   * Midrange: the mean of the largest and the smallest value.
   * It is the midpoint of the range and therefore a measure of central tendency.
   * (https://en.wikipedia.org/wiki/Mid-range)
   *
   *     max x + min x
   * M = -------------
   *           2
   *
   * @param float[] $numbers
   *
   * @return float mean of the minimum and the maximum, as computed by Average::mean
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function midrange(array $numbers): float
  {
      if (\count($numbers) === 0) {
          throw new Exception\BadDataException('Cannot find the midrange of an empty list of numbers');
      }

      // Same element order as before: minimum first, maximum second.
      $extremes = [\min($numbers), \max($numbers)];

      return Average::mean($extremes);
  }
  /**
   * Variance
   *
   * Measures how far a set of numbers is spread out around its mean.
   * A variance of zero means all values are identical; variance is never negative.
   * Small values indicate data clustered near the mean, large values indicate
   * data scattered far from the mean and from each other.
   * (https://en.wikipedia.org/wiki/Variance)
   *
   *      ∑⟮xᵢ - μ⟯²
   * σ² = ----------
   *          ν
   *
   * Generalized form in which the caller chooses the degrees of freedom (ν):
   *  - population variance: ν = n
   *  - sample variance:     ν = n - 1
   * The populationVariance and sampleVariance convenience methods do this for you.
   *
   * μ is the mean; the sum of squared deviations is obtained from
   * RandomVariable::sumOfSquaresDeviations.
   *
   * @param float[] $numbers
   * @param int     $ν degrees of freedom
   *
   * @return float sum of squared deviations divided by ν
   *
   * @throws Exception\BadDataException     if the input array of numbers is empty
   * @throws Exception\OutOfBoundsException if degrees of freedom is ≤ 0
   */
  public static function variance(array $numbers, int $ν): float
  {
      // The empty-input check must come first so callers keep getting BadDataException for [].
      if (\count($numbers) === 0) {
          throw new Exception\BadDataException('Cannot find the variance of an empty list of numbers');
      }
      if ($ν < 1) {
          throw new Exception\OutOfBoundsException('Degrees of freedom must be > 0');
      }

      $sumOfSquaredDeviations = RandomVariable::sumOfSquaresDeviations($numbers);

      return $sumOfSquaredDeviations / $ν;
  }
  /**
   * Population variance: use when every possible observation of the system is present.
   * Applied to a mere subset of the data it yields a biased estimate.
   *
   *      ∑⟮xᵢ - μ⟯²
   * σ² = ----------
   *          N
   *
   * μ is the population mean
   * N is the number of values in the population
   *
   * Delegates to variance() with the degrees of freedom set to N.
   *
   * @param float[] $numbers
   *
   * @return float
   *
   * @throws Exception\BadDataException     if the input array of numbers is empty
   * @throws Exception\OutOfBoundsException if degrees of freedom is ≤ 0
   */
  public static function populationVariance(array $numbers): float
  {
      return self::variance($numbers, \count($numbers));
  }
  /**
   * Unbiased sample variance
   * Use when only a subset of all possible observations of the system is present.
   *
   *      ∑⟮xᵢ - x̄⟯²
   * S² = ----------
   *        n - 1
   *
   * x̄ is the sample mean
   * n is the number of values in the sample
   *
   * A single-element sample is defined here to have a variance of 0 instead of
   * failing on zero degrees of freedom. Everything else is delegated to
   * variance() with n - 1 degrees of freedom.
   *
   * @param float[] $numbers
   *
   * @return float
   *
   * @throws Exception\BadDataException     if the input array of numbers is empty
   * @throws Exception\OutOfBoundsException if degrees of freedom is ≤ 0
   */
  public static function sampleVariance(array $numbers): float
  {
      $n = \count($numbers);

      if ($n === 1) {
          return 0.0;
      }

      return self::variance($numbers, $n - 1);
  }
  /**
   * Weighted sample variance
   *
   * Biased case
   *
   *       ∑wᵢ⟮xᵢ - μw⟯²
   * σ²w = -------------
   *            ∑wᵢ
   *
   * Unbiased estimator for frequency weights
   *
   *       ∑wᵢ⟮xᵢ - μw⟯²
   * σ²w = -------------
   *          ∑wᵢ - 1
   *
   * μw is the weighted mean (Average::weightedMean)
   *
   * https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance
   *
   * A single number short-circuits to 0 before the weights are looked at.
   * Numbers and weights are paired by position, not by array key.
   *
   * NOTE(review): the denominator is not guarded; weights summing to 0 (biased)
   * or to 1 (unbiased) lead to a division by zero. Confirm whether callers rely on that.
   *
   * @param float[] $numbers
   * @param float[] $weights
   * @param bool    $biased  true divides by ∑wᵢ; false (default) divides by ∑wᵢ - 1
   *
   * @return float
   *
   * @throws Exception\BadDataException if the number of numbers and weights are not equal
   */
  public static function weightedSampleVariance(array $numbers, array $weights, bool $biased = false): float
  {
      $n = \count($numbers);

      if ($n === 1) {
          return 0.0;
      }
      if ($n !== \count($weights)) {
          throw new Exception\BadDataException('Numbers and weights must have the same number of elements.');
      }

      $μw = Average::weightedMean($numbers, $weights);

      // Re-index both lists so that values and weights are matched positionally.
      $values  = \array_values($numbers);
      $factors = \array_values($weights);

      $weightedSquares = 0;
      for ($i = 0; $i < $n; $i++) {
          $weightedSquares += $factors[$i] * \pow(($values[$i] - $μw), 2);
      }

      $weightTotal = \array_sum($weights);
      $denominator = $biased ? $weightTotal : $weightTotal - 1;

      return $weightedSquares / $denominator;
  }
  /**
   * Standard deviation
   * Quantifies the amount of variation or dispersion of a set of data values.
   * A low standard deviation means the data points tend to lie close to the mean;
   * a high standard deviation means they are spread over a wider range of values.
   * (https://en.wikipedia.org/wiki/Standard_deviation)
   *
   * SD＋ = √⟮σ²⟯ = √⟮population variance⟯
   * SD  = √⟮S²⟯ = √⟮sample variance⟯
   *
   * The flag is spelled with U+FF0B FULLWIDTH PLUS SIGN (＋). An ASCII '+' is
   * an operator and cannot appear in a PHP variable name, so it must not be
   * "normalized" to '+'.
   *
   * @param float[] $numbers
   * @param bool    $SD＋ : true returns SD＋ (uses population variance);
   *                        false returns SD (uses sample variance);
   *                        Default is false (SD (sample variance))
   *
   * @return float square root of the selected variance
   *
   * @throws Exception\BadDataException     if the input array of numbers is empty
   * @throws Exception\OutOfBoundsException if degrees of freedom is ≤ 0
   */
  public static function standardDeviation(array $numbers, bool $SD＋ = false): float
  {
      if (empty($numbers)) {
          throw new Exception\BadDataException('Cannot find the standard deviation of an empty list of numbers');
      }

      $variance = $SD＋
          ? self::populationVariance($numbers)
          : self::sampleVariance($numbers);

      return \sqrt($variance);
  }
  /**
   * sd - Standard deviation - convenience method
   *
   * Thin alias that forwards both arguments to standardDeviation().
   *
   * The flag is spelled with U+FF0B FULLWIDTH PLUS SIGN (＋). An ASCII '+' is
   * an operator and cannot appear in a PHP variable name.
   *
   * @param float[] $numbers
   * @param bool    $SD＋ : true returns SD＋ (uses population variance);
   *                        false returns SD (uses sample variance);
   *                        Default is false (SD (sample variance))
   *
   * @return float
   *
   * @throws Exception\BadDataException     if the input array of numbers is empty
   * @throws Exception\OutOfBoundsException if degrees of freedom is ≤ 0
   */
  public static function sd(array $numbers, bool $SD＋ = false): float
  {
      return self::standardDeviation($numbers, $SD＋);
  }
  /**
   * MAD - mean absolute deviation
   *
   * The average of the absolute deviations from the mean.
   * It is a summary statistic of statistical dispersion or variability.
   * (https://en.wikipedia.org/wiki/Average_absolute_deviation)
   *
   *       ∑|xᵢ - x̄|
   * MAD = ---------
   *           N
   *
   * x̄ is the mean (Average::mean)
   * N is the number of values
   *
   * @param float[] $numbers
   *
   * @return float
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function meanAbsoluteDeviation(array $numbers): float
  {
      if (\count($numbers) === 0) {
          throw new Exception\BadDataException('Cannot find the mean absolute deviation of an empty list of numbers');
      }

      $mean = Average::mean($numbers);

      $absoluteDeviationTotal = 0;
      foreach ($numbers as $value) {
          $absoluteDeviationTotal += \abs($value - $mean);
      }

      return $absoluteDeviationTotal / \count($numbers);
  }
  /**
   * MAD - median absolute deviation
   *
   * The median of the absolute deviations from the data's median.
   * It is a robust measure of the variability of a univariate sample of quantitative data.
   * (https://en.wikipedia.org/wiki/Median_absolute_deviation)
   *
   * MAD = median(|xᵢ - x̃|)
   *
   * x̃ is the median (Average::median)
   *
   * @param float[] $numbers
   *
   * @return float
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function medianAbsoluteDeviation(array $numbers): float
  {
      if (\count($numbers) === 0) {
          throw new Exception\BadDataException('Cannot find the median absolute deviation of an empty list of numbers');
      }

      $median = Average::median($numbers);

      // Keys are carried over so the list handed to Average::median has the same shape as before.
      $absoluteDeviations = [];
      foreach ($numbers as $key => $value) {
          $absoluteDeviations[$key] = \abs($value - $median);
      }

      return Average::median($absoluteDeviations);
  }
  /**
   * Quartiles
   * Three points that divide the data set into four equal groups, each holding a quarter of the data.
   * https://en.wikipedia.org/wiki/Quartile
   *
   * Available methods:
   *  - 'inclusive' (see quartilesInclusive)
   *  - 'exclusive' (see quartilesExclusive)
   * The method name is matched case-insensitively. Any unrecognized name falls
   * back to the exclusive method.
   *
   * @param float[] $numbers
   * @param string  $method  What quartile method to use (optional - default: exclusive)
   *
   * @return float[] (0%, Q1, Q2, Q3, 100%, IQR)
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function quartiles(array $numbers, string $method = 'exclusive'): array
  {
      if (strtolower($method) === 'inclusive') {
          return self::quartilesInclusive($numbers);
      }

      // 'exclusive' and every unknown method name end up here.
      return self::quartilesExclusive($numbers);
  }
  /**
   * Quartiles - Exclusive method
   * Three points that divide the data set into four equal groups, each holding a quarter of the data.
   * https://en.wikipedia.org/wiki/Quartile
   *
   * 0% is the smallest number
   * Q1 (25%) is the first quartile (lower quartile, 25th percentile)
   * Q2 (50%) is the second quartile (median, 50th percentile)
   * Q3 (75%) is the third quartile (upper quartile, 75th percentile)
   * 100% is the largest number
   * IQR is the interquartile range, the difference between the upper and lower quartiles (IQR = Q₃ - Q₁)
   *
   * Method used
   *  - Use the median to divide the ordered data set into two halves.
   *    - With an odd number of data points, the median (the central value of the
   *      ordered list) is left out of both halves.
   *    - With an even number of data points, the data set is split exactly in half.
   *  - The lower quartile is the median of the lower half of the data.
   *    The upper quartile is the median of the upper half of the data.
   *
   * A one-element list is special-cased: every entry is that element and the IQR is 0.
   *
   * This rule is employed by the TI-83 calculator boxplot and "1-Var Stats" functions.
   * It is the basic method commonly taught in math textbooks ("method 1" on Wikipedia).
   *
   * @param float[] $numbers
   *
   * @return array{
   *     "0%":   float,
   *     "Q1":   float,
   *     "Q2":   float,
   *     "Q3":   float,
   *     "100%": float,
   *     "IQR":  float,
   * }
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function quartilesExclusive(array $numbers): array
  {
      $length = \count($numbers);

      if ($length === 0) {
          throw new Exception\BadDataException('Cannot find the quartiles of an empty list of numbers');
      }

      if ($length === 1) {
          $only = \end($numbers);

          return \array_fill_keys(['0%', 'Q1', 'Q2', 'Q3', '100%'], $only) + ['IQR' => 0];
      }

      \sort($numbers);

      // For an odd count the upper half starts one past the middle element, which drops the median.
      $middle     = \intdiv($length, 2);
      $lower_half = \array_slice($numbers, 0, $middle);
      $upper_half = \array_slice($numbers, $middle + $length % 2);

      $Q1 = Average::median($lower_half);
      $Q3 = Average::median($upper_half);

      return [
          '0%'   => \min($numbers),
          'Q1'   => $Q1,
          'Q2'   => Average::median($numbers),
          'Q3'   => $Q3,
          '100%' => \max($numbers),
          'IQR'  => $Q3 - $Q1,
      ];
  }
  /**
   * Quartiles - Inclusive method (R method)
   * Three points that divide the data set into four equal groups, each holding a quarter of the data.
   * https://en.wikipedia.org/wiki/Quartile
   *
   * 0% is the smallest number
   * Q1 (25%) is the first quartile (lower quartile, 25th percentile)
   * Q2 (50%) is the second quartile (median, 50th percentile)
   * Q3 (75%) is the third quartile (upper quartile, 75th percentile)
   * 100% is the largest number
   * IQR is the interquartile range, the difference between the upper and lower quartiles (IQR = Q₃ - Q₁)
   *
   * Method used
   *  - Use the median to divide the ordered data set into two halves.
   *    - With an odd number of data points, the median (the central value of the
   *      ordered list) is included in both halves.
   *    - With an even number of data points, the data set is split exactly in half.
   *  - The lower quartile is the median of the lower half of the data.
   *    The upper quartile is the median of the upper half of the data.
   *
   * The values found by this method are also known as "Tukey's hinges".
   * This is "method 2" on Wikipedia.
   *
   * @param float[] $numbers
   *
   * @return array{
   *     "0%":   float,
   *     "Q1":   float,
   *     "Q2":   float,
   *     "Q3":   float,
   *     "100%": float,
   *     "IQR":  float,
   * }
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function quartilesInclusive(array $numbers): array
  {
      if (\count($numbers) === 0) {
          throw new Exception\BadDataException('Cannot find the quartiles of an empty list of numbers');
      }

      \sort($numbers);
      $length = \count($numbers);
      $middle = \intdiv($length, 2);

      if ($length % 2 === 0) {
          $lower_half = \array_slice($numbers, 0, $middle);
          $upper_half = \array_slice($numbers, $middle);
      } else {
          // Odd count: the median closes the lower half and opens the upper half.
          $median     = Average::median($numbers);
          $lower_half = \array_slice($numbers, 0, $middle);
          $upper_half = \array_slice($numbers, $middle + 1);

          $lower_half[] = $median;
          $upper_half   = \array_merge([$median], $upper_half);
      }

      $Q1 = Average::median($lower_half);
      $Q3 = Average::median($upper_half);

      return [
          '0%'   => \min($numbers),
          'Q1'   => $Q1,
          'Q2'   => Average::median($numbers),
          'Q3'   => $Q3,
          '100%' => \max($numbers),
          'IQR'  => $Q3 - $Q1,
      ];
  }
  /**
   * IQR - Interquartile range (midspread, middle fifty)
   * A measure of statistical dispersion: the difference between the upper and lower quartiles.
   * https://en.wikipedia.org/wiki/Interquartile_range
   *
   * IQR = Q₃ - Q₁
   *
   * Reads the 'IQR' entry of the report produced by quartiles().
   *
   * @param float[] $numbers
   * @param string  $method  What quartile method to use (optional - default: exclusive)
   *
   * @return float
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function interquartileRange(array $numbers, string $method = 'exclusive'): float
  {
      $quartiles = self::quartiles($numbers, $method);

      return $quartiles['IQR'];
  }
  /**
   * IQR - Interquartile range (midspread, middle fifty)
   * Short alias that forwards to interquartileRange().
   *
   * @param float[] $numbers
   * @param string  $method  What quartile method to use (optional - default: exclusive)
   *
   * @return float
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function iqr(array $numbers, string $method = 'exclusive'): float
  {
      return self::interquartileRange($numbers, $method);
  }
  /**
   * Compute the P-th percentile of a list of numbers
   *
   * Linear interpolation between closest ranks method - Second variant, C = 1
   * P-th percentile (0 <= P <= 100) of a list of N ordered values (sorted from least to greatest)
   * Similar method used in NumPy and Excel
   * https://en.wikipedia.org/wiki/Percentile
   *
   *      P
   * x = --- (N - 1) + 1
   *     100
   *
   * P = percentile
   * N = number of elements in list
   *
   * ν(x) = νₓ + x%1(νₓ₊₁ - νₓ)
   *
   * ⌊x⌋  = integer part of x
   * x%1  = fraction part of x
   * νₓ   = number in position ⌊x⌋ in the sorted list of numbers (1-based)
   * νₓ₊₁ = number in position ⌊x⌋ + 1 in the sorted list of numbers (1-based)
   *
   * Local variables use plain ASCII names: a name such as "x%1" contains the
   * modulo operator and cannot be a PHP variable.
   *
   * @param float[] $numbers
   * @param float   $P percentile to calculate
   *
   * @return float value in the list corresponding to the P-th percentile
   *
   * @throws Exception\BadDataException     if the input array of numbers is empty
   * @throws Exception\OutOfBoundsException if $P percentile is not between 0 and 100
   */
  public static function percentile(array $numbers, float $P): float
  {
      if (empty($numbers)) {
          throw new Exception\BadDataException('Cannot find the P-th percentile of an empty list of numbers');
      }
      // NAN compares false against both bounds, so it has to be rejected explicitly.
      if (\is_nan($P) || $P < 0 || $P > 100) {
          throw new Exception\OutOfBoundsException('Percentile P must be between 0 and 100.');
      }

      $N = \count($numbers);
      if ($N === 1) {
          return \array_shift($numbers);
      }

      \sort($numbers);

      // At P = 100 the rank is N, so there is no "next" element to interpolate towards.
      if ($P == 100) {
          return $numbers[$N - 1];
      }

      $x        = ($P / 100) * ($N - 1) + 1;   // 1-based fractional rank
      $rank     = \intval($x);                 // ⌊x⌋
      $fraction = $x - $rank;                  // x%1
      $lower    = $numbers[$rank - 1];         // νₓ
      $upper    = $numbers[$rank];             // νₓ₊₁

      return $lower + $fraction * ($upper - $lower);
  }
  /**
   * Midhinge
   * The average of the first and third quartiles, and thus a measure of location.
   * Equivalently, it is the 25% trimmed mid-range or 25% midsummary; it is an L-estimator.
   * https://en.wikipedia.org/wiki/Midhinge
   *
   * Midhinge = (first quartile + third quartile) / 2
   *
   * Quartiles come from quartiles() with its default method.
   *
   * @param float[] $numbers
   *
   * @return float
   *
   * @throws Exception\BadDataException if the input array of numbers is empty
   */
  public static function midhinge(array $numbers): float
  {
      $quartiles = self::quartiles($numbers);

      return Average::mean([$quartiles['Q1'], $quartiles['Q3']]);
  }
  /**
   * Coefficient of variation (cᵥ)
   * Also known as relative standard deviation (RSD)
   *
   * A standardized measure of dispersion of a probability distribution or
   * frequency distribution. It is often expressed as a percentage.
   * The ratio of the standard deviation to the mean.
   * https://en.wikipedia.org/wiki/Coefficient_of_variation
   *
   *      σ
   * cᵥ = -
   *      μ
   *
   * σ is standardDeviation() with its default flag, i.e. the sample standard deviation.
   * μ is Average::mean().
   *
   * When the mean is zero the ratio is undefined and NAN is returned, the same
   * convention describe() uses for its 'cv' entry, instead of dividing by zero.
   *
   * @param float[] $numbers
   *
   * @return float cᵥ, or NAN when the mean is zero
   *
   * @throws Exception\BadDataException     if the input array of numbers is empty
   * @throws Exception\OutOfBoundsException if degrees of freedom is ≤ 0
   */
  public static function coefficientOfVariation(array $numbers): float
  {
      $σ = self::standardDeviation($numbers);
      $μ = Average::mean($numbers);

      if ($μ == 0) {
          return \NAN;
      }

      return $σ / $μ;
  }
  /**
   * Build a report of all the descriptive statistics over a list of numbers
   * Includes mean, median, mode, range, midrange, variance, standard deviation, quartiles, etc.
   *
   * Entries that are only computed above a minimum sample size are null below it:
   *  - skewness: population n > 0, sample n >= 3
   *  - ses:      n > 2
   *  - kurtosis: population n > 3, sample n > 0
   *  - sek:      n > 3
   * 'cv' is NAN when the mean is zero.
   *
   * NOTE(review): the size guards for kurtosis mirror those for skewness with the
   * population/sample roles reversed. Kept exactly as found; confirm against the
   * requirements of the RandomVariable kurtosis functions.
   *
   * @param float[] $numbers
   * @param bool    $population : true means all possible observations of the system are present;
   *                              false means a sample is used.
   *
   * @return array{
   *     n:          int<0, max>,
   *     min:        float|false,
   *     max:        float|false,
   *     mean:       float,
   *     median:     float,
   *     mode:       float[],
   *     range:      float,
   *     midrange:   float,
   *     variance:   float,
   *     sd:         float,
   *     cv:         float,
   *     mean_mad:   float,
   *     median_mad: float,
   *     quartiles:  float[],
   *     midhinge:   float,
   *     skewness:   float|null,
   *     ses:        float|null,
   *     kurtosis:   float|null,
   *     sek:        float|null,
   *     sem:        float,
   *     ci_95:      array{ci: float|null, lower_bound: float|null, upper_bound: float|null},
   *     ci_99:      array{ci: float|null, lower_bound: float|null, upper_bound: float|null},
   * }
   *
   * @throws Exception\OutOfBoundsException
   * @throws Exception\BadDataException
   */
  public static function describe(array $numbers, bool $population = false): array
  {
      $count = \count($numbers);
      $mean  = Average::mean($numbers);
      $sd    = self::standardDeviation($numbers, $population);

      // Entries are computed and inserted strictly in report order.
      $report = [
          'n'          => $count,
          'min'        => \min($numbers),
          'max'        => \max($numbers),
          'mean'       => $mean,
          'median'     => Average::median($numbers),
          'mode'       => Average::mode($numbers),
          'range'      => self::range($numbers),
          'midrange'   => self::midrange($numbers),
          'variance'   => $population ? self::populationVariance($numbers) : self::sampleVariance($numbers),
          'sd'         => $sd,
          'cv'         => $mean ? $sd / $mean : \NAN,
          'mean_mad'   => self::meanAbsoluteDeviation($numbers),
          'median_mad' => self::medianAbsoluteDeviation($numbers),
          'quartiles'  => self::quartiles($numbers),
          'midhinge'   => self::midhinge($numbers),
      ];

      if ($population) {
          $report['skewness'] = $count > 0 ? RandomVariable::populationSkewness($numbers) : null;
      } else {
          $report['skewness'] = $count >= 3 ? RandomVariable::skewness($numbers) : null;
      }

      $report['ses'] = $count > 2 ? RandomVariable::ses($count) : null;

      if ($population) {
          $report['kurtosis'] = $count > 3 ? RandomVariable::populationKurtosis($numbers) : null;
      } else {
          $report['kurtosis'] = $count > 0 ? RandomVariable::sampleKurtosis($numbers) : null;
      }

      $report['sek']   = $count > 3 ? RandomVariable::sek($count) : null;
      $report['sem']   = RandomVariable::standardErrorOfTheMean($numbers);
      $report['ci_95'] = RandomVariable::confidenceInterval($mean, $count, $sd, '95');
      $report['ci_99'] = RandomVariable::confidenceInterval($mean, $count, $sd, '99');

      return $report;
  }
  /**
   * Five number summary
   * A descriptive statistic that provides information about a set of observations.
   * It consists of the five most important sample percentiles:
   *  1) the sample minimum (smallest observation)
   *  2) the lower quartile or first quartile
   *  3) the median (middle value)
   *  4) the upper quartile or third quartile
   *  5) the sample maximum (largest observation)
   *
   * Q1 and Q3 come from quartiles() with its default method; the median comes
   * from Average::median.
   *
   * https://en.wikipedia.org/wiki/Five-number_summary
   *
   * @param array<int|float> $numbers
   *
   * @return array{
   *     min:    float|int|false,
   *     Q1:     float,
   *     median: float,
   *     Q3:     float,
   *     max:    float|int|false,
   * }
   *
   * @throws Exception\BadDataException
   */
  public static function fiveNumberSummary(array $numbers): array
  {
      // quartiles() runs first so that an empty list is rejected before min/max are evaluated.
      $quartiles = self::quartiles($numbers);

      $summary           = [];
      $summary['min']    = \min($numbers);
      $summary['Q1']     = $quartiles['Q1'];
      $summary['median'] = Average::median($numbers);
      $summary['Q3']     = $quartiles['Q3'];
      $summary['max']    = \max($numbers);

      return $summary;
  }
  715. }