(Analysis by Charlie Yang)

Subtask 1:

In this subtask, buckets can only contain $0$ gallons or $1$ gallon.

Firstly, if all buckets contain $1$ gallon, the answer is clearly $0$.

Next, if all buckets except for one contain $1$ gallon, the answer is still $0$, and the optimal merging strategy is to take the bucket with $0$ gallons initially and merge it with either the left or right one, until only one bucket is left.

We now assume that at least two buckets contain $0$ gallons.

Intuitively, we want to first merge all the buckets with $0$ gallons together, since that reduces the number of buckets with $0$ gallons. Therefore, in the swapping phase, we want to move all the $0$s together into one subarray, such that after merging the $0$s until only one is left, it is reduced to the second case mentioned above.

This can be done by an $O(N^2)$ algorithm by trying every possible position of the subarray containing $0$s.

Sujay's code:

#include <bits/stdc++.h>
using namespace std;
 
int main() {
    ios::sync_with_stdio(false), cin.tie(nullptr);
    int T; cin >> T;
    while(T--) {
        int N; cin >> N;
        vector<int> a(N);
        for(int i = 0; i < N; i++) cin >> a[i];
        vector<int> zeros;
        for(int i = 0; i < N; i++) {
            if(a[i] == 0) zeros.push_back(i);
        }
        long long mncost = 1e18;
        // group up all the 0s
        for(int i = 0; i <= N - zeros.size(); i++) {
            long long cost = 0;
            for(int j = 0; j < zeros.size(); j++) {
                cost += abs(i + j - zeros[j]);
            }
            mncost = min(mncost, cost);
        }
        cout << mncost << endl;
    }
}

Subtask 2:

To improve the complexity of the previous algorithm, we notice that intuitively, subarrays too far to the left or to the right are unlikely to be the configuration that requires the fewest swaps.

In fact, we can show that one subarray position that requires fewest swaps can be constructed by fixing the $\lfloor\frac{k}{2}\rfloor+1$-th $0$ and moving all the other $0$s next to it, where $k$ is the number of $0$s in the array.

The time complexity is $O(N)$.

Sujay's code:

#include <bits/stdc++.h>
using namespace std;
 
int main() {
    ios::sync_with_stdio(false), cin.tie(nullptr);
    int T; cin >> T;
    while(T--) {
        int N; cin >> N;
        vector<int> a(N);
        for(int i = 0; i < N; i++) cin >> a[i];
        vector<int> zeros;
        for(int i = 0; i < N; i++) {
            if(a[i] == 0) zeros.push_back(i);
        }
        if(zeros.size() == 0) {
            cout << 0 << endl;
            continue;
        }
        int middle = zeros[zeros.size() / 2];
        long long cost = 0;
        for(int i = 0; i < zeros.size(); i++) {
            cost += abs(middle - zeros[i]) - abs((int)zeros.size() / 2 - i);
        }
        cout << cost << endl;
    }
}

Subtask 3:

To solve this, we first need to understand Phase 2. Every time we merge two buckets, we are essentially building a binary tree. The final amount of milk is the sum of each bucket $a_i$ multiplied by $2^{-d_i}$, where $d_i$ is how many times that bucket was merged (its depth in the tree).

To maximize the result, we need the largest values to have the smallest depths. So, to preserve the largest values for last, we observe that when merging, it is always optimal to merge the smallest two values. The resulting value would then be $\sum_{i=1}^n 2^{-i}b_i+2^{-n}b_n$, where $b$ is $a$ sorted from largest to smallest, which is maximal.

Reconsidering the first phase, we see that we should swap the array such that it is first non-increasing, then non-decreasing, so that the smallest two elements are always adjacent in the merging phase.

To calculate the minimum of swaps to make the array non-increasing followed by non-decreasing, we can iterate from the smallest value to the largest value in the array, and for each value, considering only the elements with strictly smaller value, we choose to either swap it to the left side or the right side.

This gives us an $O(N^2)$ algorithm.

Sujay's code:

#include <bits/stdc++.h>
using namespace std;
 
int main() {
    ios::sync_with_stdio(false), cin.tie(nullptr);
    int T; cin >> T;
    while(T--) {
        int N; cin >> N;
        vector<int> a(N);
        for(int i = 0; i < N; i++) cin >> a[i];
        map<int, vector<int>> mp;
        for(int i = 0; i < N; i++) mp[a[i]].push_back(i);
        long long ans = 0;
        for(auto [u, v] : mp) {
            for(int i : v) {
                int lcnt = 0;
                for(int j = 0; j < i; j++) lcnt += a[j] < u;
                int rcnt = 0;
                for(int j = i + 1; j < N; j++) rcnt += a[j] < u;
                ans += min(lcnt, rcnt);
            }
        }
        cout << ans << endl;
    }
}

Full solution:

To improve on the previous algorithm, we can see that if we maintain an array $b$ where $b_i=1$ if the $i$-th bucket has smaller value than the current bucket and $b_i=0$ otherwise, then the cost to move the current bucket to the left is $\sum_{i=1}^k b_i$, and the cost the move the current bucket to the right is $\sum_{i=k}^n b_i$, with $k$ being the position of the current bucket. This can be modeled by either a binary indexed tree or a segment tree in $O(\log n)$ time per update/query.

Therefore, this gives us an $O(N\log N)$ algorithm.

Sujay's code:

#include <bits/stdc++.h>
using namespace std;
 
struct BIT {
    int n;
    vector<int> t;
    BIT(int _n) : n(_n), t(n + 1) {}
 
    void add(int k, int x) {
        while(k <= n) {
            t[k] += x;
            k += k&-k;
        }
    }
 
    int sum(int r) {
        int sm = 0;
        while(r > 0) {
            sm += t[r];
            r -= r&-r;
        }
        return sm;
    }
};
 
void tc() {
    int N; cin >> N;
 
    vector<int> a(N);
    for(int i = 0; i < N; i++) cin >> a[i];
    vector<int> o(N); iota(o.begin(), o.end(), 0);
    sort(o.begin(), o.end(), [&] (int x, int y) {
        return a[x] < a[y];
    });
 
    BIT bit(N);
    int last = -1;
    vector<int> upds;
    long long ans = 0;
    for(int i : o) {
        if(last != a[i]) {
            for(int u : upds) {
                bit.add(u, 1);
            }
            upds.clear();
        }
        ans += min(bit.sum(N) - bit.sum(i + 1), bit.sum(i + 1));
        last = a[i];
        upds.push_back(i + 1);
    }
    cout << ans << endl;
}
 
int main() {
    ios::sync_with_stdio(false), cin.tie(nullptr);
    int T; cin >> T;
 
    while(T--) {
        tc();
    }
}

Alternatively, we can use an ordered set given by the policy-based data structure '__gnu_pbds::tree' in C++.

#include <bits/extc++.h>
using namespace std;
using namespace __gnu_pbds;
using oset=tree<int, null_type, less<int>, rb_tree_tag,
tree_order_statistics_node_update>;

int main(){
    cin.tie(0) -> sync_with_stdio(0);
    int T;
    cin >> T;
    while (T--) {
        int N;
        cin >> N;
        vector<int> a(N);
        oset t;
        map<int, vector<int>> mp;
        for (int i = 0; i < N; i++) {
            cin >> a[i];
            mp[a[i]].push_back(i);
        }
        long long ans = 0;
        for (auto [k, v] : mp) {
            for (int x : v) {
                int pos = t.order_of_key(x);
                ans += min(pos, (int)t.size() - pos);
            }
            for (int x : v) t.insert(x);
        }
        cout << ans << '\n';
    }
}