ANOVA.php 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591
  1. <?php
  2. namespace MathPHP\Statistics;
  3. use MathPHP\Probability\Distribution\Continuous\F;
  4. use MathPHP\Exception;
  5. /**
  6. * ANOVA (Analysis of Variance)
  7. */
  8. class ANOVA
  9. {
  10. /**
  11. * One-way ANOVA
  12. * Technique used to compare means of three or more samples
  13. * (using the F distribution).
  14. * https://en.wikipedia.org/wiki/One-way_analysis_of_variance
  15. *
  16. * Produces the following analysis of the data:
  17. *
  18. * ANOVA hypothesis test summary data
  19. *
  20. * | SS | df | MS | F | P |
  21. * Treatment | | | | | |
  22. * Error | | | |
  23. * Total | | |
  24. *
  25. * where:
  26. * Treament is between groups
  27. * Error is within groups
  28. * SS = Sum of squares
  29. * df = Degrees of freedom
  30. * MS = Mean squares
  31. * F = F statistic
  32. * P = P value
  33. *
  34. * Data summary table
  35. *
  36. * | N | Sum | Mean | SS | Variance | SD | SEM |
  37. * 0 | | | | | | | |
  38. * 1 | | | | | | | |
  39. * ... | | | | | | | |
  40. * Total | | | | | | | |
  41. *
  42. * where:
  43. * Each row is the summary for a sample, numbered from 0 to m - 1
  44. * m = Number of samples
  45. * N = Sample size
  46. * SS = Sum of squares
  47. * SD = Standard deviation
  48. * SEM = Standard error of the mean
  49. *
  50. * Calculations
  51. *
  52. * Sum of Squares
  53. * SST (sum of squares total)
  54. * ∑⟮xᵢ − μ⟯²
  55. * where:
  56. * xᵢ = each element of all samples
  57. * μ = mean total of all elements of all samples
  58. *
  59. * SSB (sum of squares between - treatment)
  60. * ∑n(x - μ)²
  61. * where:
  62. * n = sample size
  63. * x = sample mean
  64. * μ = mean total of all elements of all samples
  65. *
  66. * SSW (sum of squares within - error)
  67. * ∑∑⟮xᵢ − μ⟯² Sum of sum of squared deviations of each sample
  68. * where:
  69. * xᵢ = each element of the sample
  70. * μ = mean of the sample
  71. *
  72. * Degrees of Freedom
  73. * dfT (degrees of freedom for the total)
  74. * mn - 1
  75. *
  76. * dfB (degrees of freedom between - treatment)
  77. * m - 1
  78. *
  79. * dfW (degrees of freedom within - error)
  80. * m(n - 1)
  81. *
  82. * where:
  83. * m = number of samples
  84. * n = number of elements in each sample
  85. *
  86. * Mean Squares
  87. * MSB (Mean squares between - treatment)
  88. * SSB / dfB
  89. *
  90. * MSW (Mean squares within - error)
  91. * SSW / dfW
  92. *
  93. * Test Statistics
  94. * F = MSB / MSW
  95. * P = F distribution CDF above F with degrees of freedom dfB and dfW
  96. *
  97. * @param array<float> ...$samples Samples to analyze (at least 3 or more samples)
  98. *
  99. * @return array<string, mixed> [
  100. * ANOVA => [
  101. * treatment => [SS, df, MS, F, P],
  102. * error => [SS, df, MS],
  103. * total => [SS, df],
  104. * ],
  105. * total_summary => [n, sum, mean, SS, variance, sd, sem],
  106. * data_summary => [
  107. * 0 => [n, sum, mean, SS, variance, sd, sem],
  108. * 1 => [n, sum, mean, SS, variance, sd, sem],
  109. * ...
  110. * ]
  111. * ]
  112. *
  113. * @throws Exception\BadDataException if less than three samples, or if all samples don't have the same number of values
  114. * @throws Exception\OutOfBoundsException
  115. */
  116. public static function oneWay(array ...$samples): array
  117. {
  118. // Must have at least three samples
  119. $m = \count($samples);
  120. if ($m < 3) {
  121. throw new Exception\BadDataException('Must have at least three samples');
  122. }
  123. // All samples must have the same number of items
  124. $n = \count($samples[0]);
  125. for ($i = 1; $i < $m; $i++) {
  126. if (\count($samples[$i]) !== $n) {
  127. throw new Exception\BadDataException('All samples must have the same number of values');
  128. }
  129. }
  130. // Summary data for each sample
  131. $summary_data = [];
  132. foreach ($samples as $i => $sample) {
  133. $summary_data[$i] = [];
  134. $summary_data[$i]['n'] = $n;
  135. $summary_data[$i]['sum'] = \array_sum($sample);
  136. $summary_data[$i]['mean'] = Average::mean($sample);
  137. $summary_data[$i]['SS'] = RandomVariable::sumOfSquares($sample);
  138. $summary_data[$i]['variance'] = Descriptive::sampleVariance($sample);
  139. $summary_data[$i]['sd'] = Descriptive::sd($sample);
  140. $summary_data[$i]['sem'] = RandomVariable::standardErrorOfTheMean($sample);
  141. }
  142. // Totals summary
  143. $all_elements = \array_reduce(
  144. $samples,
  145. function ($merged, $sample) {
  146. return \array_merge($merged, $sample);
  147. },
  148. array()
  149. );
  150. $μ = Average::mean($all_elements);
  151. $total = [
  152. 'n' => \count($all_elements),
  153. 'sum' => \array_sum($all_elements),
  154. 'mean' => $μ,
  155. 'SS' => RandomVariable::sumOfSquares($all_elements),
  156. 'variance' => Descriptive::sampleVariance($all_elements),
  157. 'sd' => Descriptive::sd($all_elements),
  158. 'sem' => RandomVariable::standardErrorOfTheMean($all_elements),
  159. ];
  160. // ANOVA sum of squares
  161. $SST = RandomVariable::sumOfSquaresDeviations($all_elements);
  162. $SSB = \array_sum(\array_map(
  163. function ($sample) use ($n, $μ) {
  164. return $n * (Average::mean($sample) - $μ) ** 2;
  165. },
  166. $samples
  167. ));
  168. $SSW = \array_sum(\array_map(
  169. 'MathPHP\Statistics\RandomVariable::sumOfSquaresDeviations',
  170. $samples
  171. ));
  172. // ANOVA degrees of freedom
  173. $dfT = $m * $n - 1;
  174. $dfB = $m - 1;
  175. $dfW = $m * ($n - 1);
  176. // ANOVA mean squares
  177. $MSB = $SSB / $dfB;
  178. $MSW = $SSW / $dfW;
  179. // Test statistics
  180. $F = $MSB / $MSW;
  181. $fDist = new F($dfB, $dfW);
  182. $P = $fDist->above($F);
  183. // Return ANOVA report
  184. return [
  185. 'ANOVA' => [
  186. 'treatment' => [
  187. 'SS' => $SSB,
  188. 'df' => $dfB,
  189. 'MS' => $MSB,
  190. 'F' => $F,
  191. 'P' => $P,
  192. ],
  193. 'error' => [
  194. 'SS' => $SSW,
  195. 'df' => $dfW,
  196. 'MS' => $MSW,
  197. ],
  198. 'total' => [
  199. 'SS' => $SST,
  200. 'df' => $dfT,
  201. ],
  202. ],
  203. 'total_summary' => $total,
  204. 'data_summary' => $summary_data,
  205. ];
  206. }
  207. /**
  208. * Two-way ANOVA
  209. * Examines the influence of two different categorical independent variables on
  210. * one continuous dependent variable. The two-way ANOVA not only aims at assessing
  211. * the main effect of each independent variable but also if there is any interaction
  212. * between them (using the F distribution).
  213. * https://en.wikipedia.org/wiki/Two-way_analysis_of_variance
  214. *
  215. * Produces the following analysis of the data:
  216. *
  217. * ANOVA hypothesis test summary data
  218. *
  219. * | SS | df | MS | F | P |
  220. * Factor A | | | | | |
  221. * Factor B | | | | | |
  222. * Interaction | | | | | |
  223. * Error | | | |
  224. * Total | | |
  225. *
  226. * where:
  227. * Interaction = Factor A X Factor B working together
  228. * Error is within groups
  229. * SS = Sum of squares
  230. * df = Degrees of freedom
  231. * MS = Mean squares
  232. * F = F statistic
  233. * P = P value
  234. *
  235. * Data summary tables for:
  236. * Factor A
  237. * Factor B
  238. * Factor AB (Interaction)
  239. * Total
  240. *
  241. * | N | Sum | Mean | SS | Variance | SD | SEM |
  242. * 0 | | | | | | | |
  243. * 1 | | | | | | | |
  244. * ... | | | | | | | |
  245. * Total | | | | | | | |
  246. *
  247. * where:
  248. * Each row is the summary for a sample, numbered from 0 to m - 1
  249. * m = Number of samples
  250. * N = Sample size
  251. * SS = Sum of squares
  252. * SD = Standard deviation
  253. * SEM = Standard error of the mean
  254. *
  255. * Calculations
  256. *
  257. * Sum of Squares
  258. * SST (sum of squares total)
  259. * ∑⟮xᵢ − μ⟯²
  260. * where:
  261. * xᵢ = each element of all samples
  262. * μ = mean total of all elements of all samples
  263. *
  264. * SSA, SSB (sum of squares for each factor A and B)
  265. * ∑n(x - μ)²
  266. * where:
  267. * n = sample size
  268. * x = sample mean
  269. * μ = mean total of all elements of all samples
  270. *
  271. * SSW (sum of squares within - error)
  272. * ∑∑⟮x − μ⟯² Sum of sum of squared deviations of each sample
  273. * where:
  274. * x = mean of each AB
  275. * μ = mean of the sample
  276. *
  277. * SSAB (sum of squares AB - interaction)
  278. * SSAB = SST - SSA - SSB - SSW;
  279. *
  280. * Degrees of Freedom
  281. * dfT (degrees of freedom for the total)
  282. * n - 1
  283. *
  284. * dfA (degrees of freedom factor A)
  285. * r - 1
  286. *
  287. * dfB (degrees of freedom factor B)
  288. * c - 1
  289. *
  290. * dfAB (degrees of freedom factor AB - interaction)
  291. * (r - 1)(c - 1)
  292. *
  293. * dfW (degrees of freedom within - error)
  294. * n - rc
  295. *
  296. * where:
  297. * n = number of samples
  298. * r = number of rows (number of factor As)
  299. * c = number of columns (number of factor Bs)
  300. *
  301. * Mean Squares
  302. * MSA (Mean squares factor A)
  303. * SSA / dfA
  304. *
  305. * MSB (Mean squares factor B)
  306. * SSB / dfB
  307. *
  308. * MSAB (Mean squares factor AB - interaction)
  309. * SSAB / dfAB
  310. *
  311. * MSW (Mean squares within - error)
  312. * SSW / dfW
  313. *
  314. * F Test Statistics
  315. * FA = MSA / MSW
  316. * FB = MSB / MSW
  317. * FAB = MSAB / MSW
  318. *
  319. * P values
  320. * PA = F distribution CDF above FA with degrees of freedom dfA and dfW
  321. * PB = F distribution CDF above FB with degrees of freedom dfA and dfW
  322. * PAB = F distribution CDF above FAB with degrees of freedom dfAB and dfW
  323. *
  324. * Example input data for ...$data parameter:
  325. * | Factor B₁ | Factor B₂ | ⋯
  326. * Factor A₁ | 4, 6, 8 | 6, 6, 9 | ⋯
  327. * Factor A₂ | 4, 8, 9 | 7, 10, 13 | ⋯
  328. * ⋮ ⋮ ⋮ ⋮
  329. * @param array<float[]> ...$data Samples to analyze [
  330. * // Factor A₁
  331. * [
  332. * [4, 6, 8] // Factor B₁
  333. * [6, 6, 9] // Factor B₂
  334. * ⋮
  335. * ],
  336. * // Factor A₂
  337. * [
  338. * [4, 8, 9] // Factor B₁
  339. * [7, 10, 13] // Factor B₂
  340. * ⋮
  341. * ],
  342. * ...
  343. * ]
  344. *
  345. * @return array<string, mixed> [
  346. * ANOVA => [
  347. * factorA => [SS, df, MS, F, P],
  348. * factorB => [SS, df, MS, F, P],
  349. * factorAB => [SS, df, MS, F, P],
  350. * error => [SS, df, MS],
  351. * total => [SS, df],
  352. * ],
  353. * total_summary => [n, sum, mean, SS, variance, sd, sem],
  354. * summary_factorA => [
  355. * 0 => [n, sum, mean, SS, variance, sd, sem],
  356. * 1 => [n, sum, mean, SS, variance, sd, sem],
  357. * ...
  358. * ],
  359. * summary_factorB => [
  360. * 0 => [n, sum, mean, SS, variance, sd, sem],
  361. * 1 => [n, sum, mean, SS, variance, sd, sem],
  362. * ...
  363. * ],
  364. * summary_factorAB => [
  365. * 0 => [n, sum, mean, SS, variance, sd, sem],
  366. * 1 => [n, sum, mean, SS, variance, sd, sem],
  367. * ...
  368. * ]
  369. * ]
  370. * @throws Exception\BadDataException if less than two A factors, or if B factors or values have different number elements
  371. * @throws Exception\OutOfBoundsException
  372. */
  373. public static function twoWay(array ...$data): array
  374. {
  375. // Must have at least two rows (two types of factor A)
  376. $r = \count($data);
  377. if ($r < 2) {
  378. throw new Exception\BadDataException('Must have at least two rows (two types of factor A)');
  379. }
  380. // All samples must have the same number the second factor B
  381. $c = \count($data[0]);
  382. for ($i = 1; $i < $r; $i++) {
  383. if (\count($data[$i]) !== $c) {
  384. throw new Exception\BadDataException('All samples must have the same number of the second factor B');
  385. }
  386. }
  387. // Each AB factor interaction must have the same number of values
  388. $v = \count($data[0][0]);
  389. for ($i = 0; $i < $r; $i++) {
  390. for ($j = 0; $j < $c; $j++) {
  391. if (\count($data[$i][$j]) !== $v) {
  392. throw new Exception\BadDataException('Each AB factor interaction must have the same number of values');
  393. }
  394. }
  395. }
  396. // Aggregates for all elements, rows (factor A), and columns (factor B)
  397. $all_elements = [];
  398. $A_elements = [];
  399. $B_elements = [];
  400. // Summaries for factor A, factor B, and AB
  401. $summary_A = [];
  402. $summary_B = [];
  403. $summary_AB = [];
  404. // Summary data for each AB
  405. // And aggregate all elements and elements for factor A
  406. foreach ($data as $A => $Bs) {
  407. $A_elements[$A] = [];
  408. foreach ($Bs as $B => $values) {
  409. // Aggregates
  410. $all_elements = \array_merge($all_elements, $values);
  411. $A_elements[$A] = \array_merge($A_elements[$A], $values);
  412. // AB summary
  413. $summary_AB[$A][$B] = [];
  414. $summary_AB[$A][$B]['n'] = $c;
  415. $summary_AB[$A][$B]['sum'] = \array_sum($values);
  416. $summary_AB[$A][$B]['mean'] = Average::mean($values);
  417. $summary_AB[$A][$B]['SS'] = RandomVariable::sumOfSquares($values);
  418. $summary_AB[$A][$B]['variance'] = Descriptive::sampleVariance($values);
  419. $summary_AB[$A][$B]['sd'] = Descriptive::sd($values);
  420. $summary_AB[$A][$B]['sem'] = RandomVariable::standardErrorOfTheMean($values);
  421. }
  422. }
  423. // Aggregate elements for factor B
  424. for ($B = 0; $B < $c; $B++) {
  425. $B_elements[$B] = [];
  426. foreach ($data as $factor1s) {
  427. $B_elements[$B] = \array_merge($B_elements[$B], $factor1s[$B]);
  428. }
  429. }
  430. // Factor A summary
  431. foreach ($A_elements as $A => $elements) {
  432. $summary_A[$A] = [];
  433. $summary_A[$A]['n'] = \count($elements);
  434. $summary_A[$A]['sum'] = \array_sum($elements);
  435. $summary_A[$A]['mean'] = Average::mean($elements);
  436. $summary_A[$A]['SS'] = RandomVariable::sumOfSquares($elements);
  437. $summary_A[$A]['variance'] = Descriptive::sampleVariance($elements);
  438. $summary_A[$A]['sd'] = Descriptive::sd($elements);
  439. $summary_A[$A]['sem'] = RandomVariable::standardErrorOfTheMean($elements);
  440. }
  441. // Factor B summary
  442. foreach ($B_elements as $B => $elements) {
  443. $summary_B[$B] = [];
  444. $summary_B[$B]['n'] = \count($elements);
  445. $summary_B[$B]['sum'] = \array_sum($elements);
  446. $summary_B[$B]['mean'] = Average::mean($elements);
  447. $summary_B[$B]['SS'] = RandomVariable::sumOfSquares($elements);
  448. $summary_B[$B]['variance'] = Descriptive::sampleVariance($elements);
  449. $summary_B[$B]['sd'] = Descriptive::sd($elements);
  450. $summary_B[$B]['sem'] = RandomVariable::standardErrorOfTheMean($elements);
  451. }
  452. // Totals summary
  453. $μ = Average::mean($all_elements);
  454. $summary_total = [
  455. 'n' => \count($all_elements),
  456. 'sum' => \array_sum($all_elements),
  457. 'mean' => $μ,
  458. 'SS' => RandomVariable::sumOfSquares($all_elements),
  459. 'variance' => Descriptive::sampleVariance($all_elements),
  460. 'sd' => Descriptive::sd($all_elements),
  461. 'sem' => RandomVariable::standardErrorOfTheMean($all_elements),
  462. ];
  463. // Sum of squares factor A
  464. $SSA = \array_sum(\array_map(
  465. function ($f1) use ($μ) {
  466. return $f1['n'] * ($f1['mean'] - $μ) ** 2;
  467. },
  468. $summary_A
  469. ));
  470. // Sum of squares factor B
  471. $SSB = \array_sum(\array_map(
  472. function ($B) use ($μ) {
  473. return $B['n'] * ($B['mean'] - $μ) ** 2;
  474. },
  475. $summary_B
  476. ));
  477. // Sum of squares within (error)
  478. $SSW = 0;
  479. foreach ($data as $A => $Bs) {
  480. foreach ($Bs as $B => $values) {
  481. foreach ($values as $value) {
  482. $SSW += ($value - $summary_AB[$A][$B]['mean']) ** 2;
  483. }
  484. }
  485. }
  486. // Sum of squares total
  487. $SST = 0;
  488. foreach ($data as $A => $Bs) {
  489. foreach ($Bs as $B => $values) {
  490. foreach ($values as $value) {
  491. $SST += ($value - $μ) ** 2;
  492. }
  493. }
  494. }
  495. // Sum of squares AB interaction
  496. $SSAB = $SST - $SSA - $SSB - $SSW;
  497. // Degrees of freedom
  498. $dfA = $r - 1;
  499. $dfB = $c - 1;
  500. $dfAB = ($r - 1) * ($c - 1);
  501. $dfW = $summary_total['n'] - ($r * $c);
  502. $dfT = $summary_total['n'] - 1;
  503. // Mean squares
  504. $MSA = $SSA / $dfA;
  505. $MSB = $SSB / $dfB;
  506. $MSAB = $SSAB / $dfAB;
  507. $MSW = $SSW / $dfW;
  508. // F test statistics
  509. $FA = $MSA / $MSW;
  510. $FB = $MSB / $MSW;
  511. $FAB = $MSAB / $MSW;
  512. // P values
  513. $fDist1 = new F($dfA, $dfW);
  514. $fDist2 = new F($dfB, $dfW);
  515. $fDist3 = new F($dfAB, $dfW);
  516. $PA = $fDist1->above($FA);
  517. $PB = $fDist2->above($FB);
  518. $PAB = $fDist3->above($FAB);
  519. // Return ANOVA report
  520. return [
  521. 'ANOVA' => [
  522. 'factorA' => [
  523. 'SS' => $SSA,
  524. 'df' => $dfA,
  525. 'MS' => $MSA,
  526. 'F' => $FA,
  527. 'P' => $PA,
  528. ],
  529. 'factorB' => [
  530. 'SS' => $SSB,
  531. 'df' => $dfB,
  532. 'MS' => $MSB,
  533. 'F' => $FB,
  534. 'P' => $PB,
  535. ],
  536. 'interaction' => [
  537. 'SS' => $SSAB,
  538. 'df' => $dfAB,
  539. 'MS' => $MSAB,
  540. 'F' => $FAB,
  541. 'P' => $PAB,
  542. ],
  543. 'error' => [
  544. 'SS' => $SSW,
  545. 'df' => $dfW,
  546. 'MS' => $MSW,
  547. ],
  548. 'total' => [
  549. 'SS' => $SST,
  550. 'df' => $dfT,
  551. ],
  552. ],
  553. 'total_summary' => $summary_total,
  554. 'summary_factorA' => $summary_A,
  555. 'summary_factorB' => $summary_B,
  556. 'summary_interaction' => $summary_AB,
  557. ];
  558. }
  559. }