(Analysis by Sujay Konda)

Note: We will use 0-indexing instead of 1-indexing.

When cows bouncing off of each other, it is equivalent to the cows continuing on their original paths but their labels swapping. This means the problem reduces to finding what path label 0 ends up at after all the label swaps. Call the path of cow $i$ going clockwise the CW-path $i$, and counterclockwise the CCW-path $i$.

Another important observation is that the cyclic order of the cows stays the same, so cow $1$ is clockwise of cow $0$, cow $2$ is clockwise of cow $1$, and so on, including cow $0$ being clockwise of cow $N-1$. This means that whenever a collision/label swap occurs, the two labels are consecutive mod $N$ (so one path's label increases by one and the other decreases by one). More specifically, when a CW-path and a CCW-path intersect, the label of the CW-path increases by $1$, and the label of the CCW-path decreases by $1$ (mod $N$).

For each of the $2N$ possible paths (one for each cow going clockwise or counterclockwise), we want to find the probability that label $0$ ends up at that path. Let's focus on just the $N$ possible CW-paths (as you can reverse everything to solve the counterclockwise case). Consider the CW-path $i$. Let the number of paths it intersects with be $T$. Then, the path will end up with label $i + T$, and for it to end up with label $0$, we want $i + T \equiv 0 \pmod N$, which means $T \equiv -i \pmod N$. Also note that CW-path $i$ will intersect with all CCW cows within the range $[x_i, x_i + 2K]$ (wrapping around and recounting if $2K \geq M$).

Let $L =$ # of full wraps (of size $M$) around the circle, $E =$ the remaining arc length leftover. Also let, $A =$ # of cows going counterclockwise in total, and $B =$ # of cows going counterclockwise in the range $[x_i, x_i + E]$. Then $L = \lfloor 2K/M \rfloor$, $E = 2K \bmod M$. Since every full wrap contributes $A$ intersections, and the remaining arc contributes $B$, $T = LA + B$. Since we want $T \equiv -i \pmod N$, for every $A = 0, \dots, N - 1$, $B = (-i-LA) \bmod N$ ($B < N$, so $1$ possible value of $B$ works). Define $C = A - B$, (i.e. $C$ is the number of CCW cows outside the leftover arc). Now, to find the probability that this path contains label $0$, we just need to find the probability there are $B$ CCW cows inside $[x_i, x_i + E]$, and $C$ CCW cows outside $[x_i, x_i + E]$.

This leads to a $O(N^3)$ knapsack DP, where we maintain $dp_i =$ probability of having exactly $i$ cows going counterclockwise. When adding a cow to the dp, we do $ndp_i = (1 - p_i) \cdot dp_{i - 1} + p_i \cdot dp_i$. We have one dp for everything inside $[x_i, x_i + E]$ (call this $in$), and another for everything outside (call this $out$), and then the probability label $0$ ends up at CW-path $i$ is

$$\sum_{\text{over all $(B, C)$ pairs}} in_B \cdot out_C.$$
If label $0$ is on CW-path $i$, then its final position is $(x_i + K) \bmod M$, so we can update the answer accordingly.

To optimize this to $O(N^2)$, notice that this DP is the same as polynomial multiplication. Let $F(x) = \sum dp_i x^i$, then when adding a cow, $\sum ndp_i x^i = \sum ((1 - p_i) \cdot dp_{i - 1} + p_i \cdot dp_i) x^i =F(x) \cdot ((1 - p_i)x + p_i)$. Also note that the cows we are adding (everything inside $[x_i, x_i + E]$ and everything outside $[x_i, x_i + E]$ are all circular arcs). Therefore, the problem reduces to quickly finding

$$\prod_{i \in [l, r]} ((1 - p_i)x + p_i),$$
where $[l, r]$ is some circular arc. Instead of dealing with circular arcs, we turn them into ranges by duplicating the probabilities and positions of them so cow $i+N$ = cow $i$.

Now let's analyze the all the cows within $[x_i, x_i + E]$. Let $r_i$ be the last cow whose position is $\leq x_i + E$. Note that we can maintain $r_i$ with two pointers (incrementing $r_i$ whenever $r_i + 1$ fits in $[x_i, x_i + E]$). Then the respective range of cows is $[i, r_i]$. Then the cows on the outside of $[x_i, x_i + E]$ are $[r_i + 1, i + N - 1]$. Suppose we maintained two polynomials, $P_{in}$ and $P_{out}$. As we increment $i$, $r_i$ increases, and we can add those cows to $P_{in}$ with polynomial multiplication, but we also have to remove the previous $i$. We do this with polynomial division, specifically dividing by $((1 - p_i)x + p_i)$. We can do something similar to maintain $P_{out}$ (dividing when $r_i$ increases and adding $i + N - 1$ when $i$ increases). Since $i$ and $r_i$ increment at most $N$ times, and the polynomial division and multiplication takes $O(N)$ time, this will overall take $O(N^2)$ time. Then taking the sum over all $(B, C)$ pairs takes $O(N)$ time since there are $N$ pairs, giving us a final time complexity of $O(N^2)$.

My code:

#include <bits/stdc++.h>
using namespace std;

using ll = long long;

const int MOD = 1e9 + 7;

int bpow(int x, int y) {
    return (y == 0) ? 1 : ((ll)bpow((ll)x * x % MOD, y / 2) * ((y % 2) ? x : 1) % MOD);
}

void mulp(vector<int>& poly, int pr) {
    int npr = (MOD + 1 - pr) % MOD;
    if(npr == 0) return;
    poly.push_back(0);
    for(int i = poly.size() - 2; i >= 0; i--) {
        poly[i + 1] = (poly[i + 1] + (ll)npr * poly[i]) % MOD;
        poly[i] = ((ll)pr * poly[i]) % MOD;
    }
}
void divp(vector<int>& poly, int pr, int ipr) {
    int npr = (MOD + 1 - pr) % MOD;
    if(npr == 0) return;
    int v = poly.back();
    for(int i = poly.size() - 1; i >= 1; i--) {
        int nv = poly[i - 1];
        poly[i - 1] = (ll)v * ipr % MOD;
        v = (nv - (ll)poly[i - 1] * pr) % MOD;
        if(v < 0) v += MOD;
    }
    assert(v == 0);
    poly.pop_back();
}

void tc() {
    int N, M; cin >> N >> M;
    ll K; cin >> K;

    ll L = ((2 * K) / M) % N;
    ll E = (2 * K) % M;

    vector<int> p(N), x(N);
    for(int i = 0; i < N; i++) cin >> p[i];
    for(int i = 0; i < N; i++) cin >> x[i];

    auto get = [&] (int i) {
        if(i >= N) return x[i - N] + M;
        else return x[i];
    };

    auto solveR = [&] () {
        vector<int> ip(N);
        for(int i = 0; i < N; i++) ip[i] = bpow(MOD + 1 - p[i], MOD - 2);

        int j = 0;
        vector<int> ans(M);
        vector<int> cpoly(1, 1), opoly(1, 1);
        for(int i = 0; i < N; i++) {
            mulp(opoly, p[i]);
        }
        for(int i = 0; i < N; i++) {
            if(i > 0)
                mulp(opoly, p[i - 1]);
            while(get(j) - get(i) <= E) {
                mulp(cpoly, p[j % N]);
                divp(opoly, p[j % N], ip[j % N]);
                j++;
            }
            divp(cpoly, p[i], ip[i]);

            int cans = 0;
            int clc = (N - i) % N;
            for(int lc = 0; lc < N; lc++) {
                int olc = lc - clc;
                if(clc < cpoly.size() && olc >= 0 && olc < opoly.size()) {
                    cans = (cans + (ll)cpoly[clc] * opoly[olc]) % MOD;
                }
                clc -= L;
                if(clc < 0) clc += N;
            }
            int endp = ((x[i] + K) % M + M) % M;
            ans[endp] = (ans[endp] + (ll)p[i] * cans) % MOD;
        }   
        return ans;
    };

    vector<int> ans = solveR();

    for(int i = 0; i < N; i++) {
        x[i] = (M - x[i]) % M;
        p[i] = (1 - p[i] + MOD) % MOD;
    }
    reverse(x.begin() + 1, x.end());
    reverse(p.begin() + 1, p.end());

    vector<int> ans2 = solveR();

    for(int i = 0; i < M; i++) {
        ans[i] = (ans[i] + ans2[(M - i) % M]) % MOD;
        if(i > 0) cout << " ";
        cout << ans[i];
    }
    cout << '\n';
}

int main() {
    ios::sync_with_stdio(false), cin.tie(nullptr);
    int T; cin >> T;
    while(T--) tc();
}